From a3a8d41844d8e847651c5da5b360bfc0f8b2f6f7 Mon Sep 17 00:00:00 2001 From: Tham Ha Thi Date: Thu, 13 Sep 2018 17:25:39 -0400 Subject: [PATCH 001/201] encryption (from apache/parquet-cpp github repo) --- cpp/src/parquet/column_reader.cc | 30 ++- cpp/src/parquet/column_reader.h | 2 +- cpp/src/parquet/column_writer.cc | 67 +++-- cpp/src/parquet/column_writer.h | 1 + cpp/src/parquet/column_writer_test.cc | 2 +- cpp/src/parquet/encryption.cc | 45 ++++ cpp/src/parquet/encryption.h | 55 ++++ cpp/src/parquet/file_reader.cc | 162 +++++++++-- cpp/src/parquet/file_writer.cc | 81 ++++-- cpp/src/parquet/file_writer.h | 5 +- cpp/src/parquet/metadata.cc | 290 +++++++++++++++++--- cpp/src/parquet/metadata.h | 52 +++- cpp/src/parquet/properties.h | 374 +++++++++++++++++++++++++- cpp/src/parquet/thrift.h | 101 +++++-- cpp/src/parquet/types.h | 58 ++++ 15 files changed, 1197 insertions(+), 128 deletions(-) create mode 100644 cpp/src/parquet/encryption.cc create mode 100644 cpp/src/parquet/encryption.h diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 288e48fe96b8..e47aa1e9c6f8 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -42,6 +42,8 @@ #include "parquet/statistics.h" #include "parquet/thrift.h" // IWYU pragma: keep +#include "parquet/util/crypto.h" + using arrow::MemoryPool; using arrow::internal::checked_cast; @@ -114,11 +116,14 @@ class SerializedPageReader : public PageReader { public: SerializedPageReader(const std::shared_ptr& stream, int64_t total_num_rows, Compression::type codec, + std::shared_ptr encryption, ::arrow::MemoryPool* pool) - : stream_(stream), + : : stream_(stream), decompression_buffer_(AllocateBuffer(pool, 0)), seen_num_rows_(0), - total_num_rows_(total_num_rows) { + total_num_rows_(total_num_rows), + encryption_(encryption), + decryption_buffer_(AllocateBuffer(pool, 0)) { max_page_header_size_ = kDefaultMaxPageHeaderSize; decompressor_ = GetCodecFromArrow(codec); } @@ -146,6 
+151,10 @@ class SerializedPageReader : public PageReader { // Number of rows in all the data pages int64_t total_num_rows_; + + // Encryption + std::shared_ptr encryption_; + std::shared_ptr decryption_buffer_; }; std::shared_ptr SerializedPageReader::NextPage() { @@ -169,7 +178,8 @@ std::shared_ptr SerializedPageReader::NextPage() { header_size = static_cast(buffer.size()); try { DeserializeThriftMsg(reinterpret_cast(buffer.data()), - &header_size, &current_page_header_); + &header_size, &current_page_header_, + encryption_.get()); break; } catch (std::exception& e) { // Failed to deserialize. Double the allowed page header size and try again @@ -198,6 +208,15 @@ std::shared_ptr SerializedPageReader::NextPage() { ParquetException::EofException(ss.str()); } + // Decrypt it if we need to + if (encryption_ != nullptr) { + decryption_buffer_->Resize(encryption_->CalculatePlainSize(compressed_len), false); + compressed_len = parquet_encryption::Decrypt( + encryption_, false, buffer, compressed_len, decryption_buffer_->mutable_data()); + + buffer = decryption_buffer_->data(); + } + // Uncompress it if we need to if (decompressor_ != nullptr) { // Grow the uncompressed buffer if we need to.
@@ -266,9 +285,10 @@ std::shared_ptr SerializedPageReader::NextPage() { std::unique_ptr PageReader::Open( const std::shared_ptr& stream, int64_t total_num_rows, - Compression::type codec, ::arrow::MemoryPool* pool) { + Compression::type codec, std::shared_ptr encryption, + ::arrow::MemoryPool* pool) { return std::unique_ptr( - new SerializedPageReader(stream, total_num_rows, codec, pool)); + new SerializedPageReader(stream, total_num_rows, codec, encryption, pool)); } // ---------------------------------------------------------------------- diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 2b6ec9f3681e..47340368929f 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -80,7 +80,7 @@ class PARQUET_EXPORT PageReader { static std::unique_ptr Open( const std::shared_ptr& stream, int64_t total_num_rows, - Compression::type codec, + Compression::type codec, std::shared_ptr encryption = NULLPTR, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); // @returns: shared_ptr(nullptr) on EOS, std::shared_ptr diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index d9d37ae376d0..34a3d63b6d57 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -44,6 +44,7 @@ #include "parquet/statistics.h" #include "parquet/thrift.h" #include "parquet/types.h" +#include "parquet/util/crypto.h" namespace parquet { @@ -136,7 +137,9 @@ int LevelEncoder::Encode(int batch_size, const int16_t* levels) { class SerializedPageWriter : public PageWriter { public: SerializedPageWriter(const std::shared_ptr& sink, - Compression::type codec, ColumnChunkMetaDataBuilder* metadata, + Compression::type codec, + const std::shared_ptr& encryption, + ColumnChunkMetaDataBuilder* metadata, MemoryPool* pool = arrow::default_memory_pool()) : sink_(sink), metadata_(metadata), @@ -145,7 +148,8 @@ class SerializedPageWriter : public PageWriter { dictionary_page_offset_(0), 
data_page_offset_(0), total_uncompressed_size_(0), - total_compressed_size_(0) { + total_compressed_size_(0), + encryption_(encryption) { compressor_ = GetCodecFromArrow(codec); thrift_serializer_.reset(new ThriftSerializer); } @@ -167,10 +171,23 @@ class SerializedPageWriter : public PageWriter { dict_page_header.__set_encoding(ToThrift(page.encoding())); dict_page_header.__set_is_sorted(page.is_sorted()); + const uint8_t* output_data_buffer = compressed_data->data(); + int32_t output_data_len = static_cast(compressed_data->size()); + + std::shared_ptr encrypted_data_buffer = nullptr; + if (encryption_.get()) { + encrypted_data_buffer = std::static_pointer_cast( + AllocateBuffer(pool_, encryption_->CalculateCipherSize(output_data_len))); + output_data_len = parquet_encryption::Encrypt( + encryption_, false, compressed_data->data(), output_data_len, + encrypted_data_buffer->mutable_data()); + output_data_buffer = encrypted_data_buffer->data(); + } + format::PageHeader page_header; page_header.__set_type(format::PageType::DICTIONARY_PAGE); page_header.__set_uncompressed_page_size(static_cast(uncompressed_size)); - page_header.__set_compressed_page_size(static_cast(compressed_data->size())); + page_header.__set_compressed_page_size(static_cast(output_data_len)); page_header.__set_dictionary_page_header(dict_page_header); // TODO(PARQUET-594) crc checksum @@ -179,11 +196,11 @@ class SerializedPageWriter : public PageWriter { if (dictionary_page_offset_ == 0) { dictionary_page_offset_ = start_pos; } - int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get()); - PARQUET_THROW_NOT_OK(sink_->Write(compressed_data->data(), compressed_data->size())); + int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), encryption_.get()); + PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len)); total_uncompressed_size_ += uncompressed_size + header_size; - total_compressed_size_ += compressed_data->size() + header_size; + 
total_compressed_size_ += output_data_len + header_size; int64_t final_pos = -1; PARQUET_THROW_NOT_OK(sink_->Tell(&final_pos)); @@ -234,10 +251,22 @@ class SerializedPageWriter : public PageWriter { ToThrift(page.repetition_level_encoding())); data_page_header.__set_statistics(ToThrift(page.statistics())); + const uint8_t* output_data_buffer = compressed_data->data(); + int32_t output_data_len = static_cast(compressed_data->size()); + + std::shared_ptr encrypted_data_buffer = AllocateBuffer(pool_, 0); + if (encryption_.get()) { + encrypted_data_buffer->Resize(encryption_->CalculateCipherSize(output_data_len)); + output_data_len = parquet_encryption::Encrypt( + encryption_, false, compressed_data->data(), output_data_len, + encrypted_data_buffer->mutable_data()); + output_data_buffer = encrypted_data_buffer->data(); + } + format::PageHeader page_header; page_header.__set_type(format::PageType::DATA_PAGE); page_header.__set_uncompressed_page_size(static_cast(uncompressed_size)); - page_header.__set_compressed_page_size(static_cast(compressed_data->size())); + page_header.__set_compressed_page_size(static_cast(output_data_len)); page_header.__set_data_page_header(data_page_header); // TODO(PARQUET-594) crc checksum @@ -247,11 +276,11 @@ class SerializedPageWriter : public PageWriter { data_page_offset_ = start_pos; } - int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get()); - PARQUET_THROW_NOT_OK(sink_->Write(compressed_data->data(), compressed_data->size())); + int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), encryption_.get()); + PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len)); total_uncompressed_size_ += uncompressed_size + header_size; - total_compressed_size_ += compressed_data->size() + header_size; + total_compressed_size_ += output_data_len + header_size; num_values_ += page.num_values(); int64_t current_pos = -1; @@ -285,19 +314,22 @@ class SerializedPageWriter : public PageWriter { 
// Compression codec to use. std::unique_ptr compressor_; + std::shared_ptr encryption_; }; // This implementation of the PageWriter writes to the final sink on Close . class BufferedPageWriter : public PageWriter { public: BufferedPageWriter(const std::shared_ptr& sink, - Compression::type codec, ColumnChunkMetaDataBuilder* metadata, + Compression::type codec, + const std::shared_ptr& encryption, + ColumnChunkMetaDataBuilder* metadata, MemoryPool* pool = arrow::default_memory_pool()) : final_sink_(sink), metadata_(metadata) { - in_memory_sink_ = CreateOutputStream(pool); - pager_ = std::unique_ptr( - new SerializedPageWriter(in_memory_sink_, codec, metadata, pool)); - } + in_memory_sink_ = CreateOutputStream(pool); + pager_ = std::unique_ptr( + new SerializedPageWriter(in_memory_sink_, codec, encryption, metadata, pool)); + } // TODO: nullptr for EncryptionProperties int64_t WriteDictionaryPage(const DictionaryPage& page) override { return pager_->WriteDictionaryPage(page); @@ -340,13 +372,14 @@ class BufferedPageWriter : public PageWriter { std::unique_ptr PageWriter::Open( const std::shared_ptr& sink, Compression::type codec, + const std::shared_ptr& encryption, ColumnChunkMetaDataBuilder* metadata, MemoryPool* pool, bool buffered_row_group) { if (buffered_row_group) { return std::unique_ptr( - new BufferedPageWriter(sink, codec, metadata, pool)); + new BufferedPageWriter(sink, codec, encryption, metadata, pool)); } else { return std::unique_ptr( - new SerializedPageWriter(sink, codec, metadata, pool)); + new SerializedPageWriter(sink, codec, encryption, metadata, pool)); } } diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index 27ca400eb461..7d8a1e6a3057 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -84,6 +84,7 @@ class PARQUET_EXPORT PageWriter { static std::unique_ptr Open( const std::shared_ptr& sink, Compression::type codec, + const std::shared_ptr& encryption, 
ColumnChunkMetaDataBuilder* metadata, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), bool buffered_row_group = false); diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index cee45c0c6f82..2d4711a64079 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -108,7 +108,7 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { metadata_ = ColumnChunkMetaDataBuilder::Make(writer_properties_, this->descr_); std::unique_ptr pager = - PageWriter::Open(sink_, column_properties.compression(), metadata_.get()); + PageWriter::Open(sink_, column_properties.compression(), nullptr, metadata_.get()); std::shared_ptr writer = ColumnWriter::Make(metadata_.get(), std::move(pager), writer_properties_.get()); return std::static_pointer_cast>(writer); diff --git a/cpp/src/parquet/encryption.cc b/cpp/src/parquet/encryption.cc new file mode 100644 index 000000000000..0a2d9ef939a6 --- /dev/null +++ b/cpp/src/parquet/encryption.cc @@ -0,0 +1,45 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "encryption.h" + +#include + +namespace parquet { + +// integer key retriever +void IntegerKeyIdRetriever::PutKey(uint32_t key_id, const std::string& key) { + key_map_.insert(std::make_pair(key_id, key)); +} + +const std::string& IntegerKeyIdRetriever::GetKey(const std::string& key_metadata) { + uint32_t key_id; + memcpy(reinterpret_cast(&key_id), key_metadata.c_str(), 4); + + return key_map_[key_id]; +} + +// string key retriever +void StringKeyIdRetriever::PutKey(const std::string& key_id, const std::string& key) { + key_map_.insert(std::make_pair(key_id, key)); +} + +const std::string& StringKeyIdRetriever::GetKey(const std::string& key_id) { + return key_map_[key_id]; +} + +} // namespace parquet diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h new file mode 100644 index 000000000000..1dbf0d20f39e --- /dev/null +++ b/cpp/src/parquet/encryption.h @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef PARQUET_ENCRYPTION_H +#define PARQUET_ENCRYPTION_H + +#include +#include +#include + +namespace parquet { + +class PARQUET_EXPORT DecryptionKeyRetriever { + public: + virtual const std::string& GetKey(const std::string& key_metadata) = 0; + virtual ~DecryptionKeyRetriever() {} +}; + +// Simple integer key retriever +class PARQUET_EXPORT IntegerKeyIdRetriever : public DecryptionKeyRetriever { + public: + void PutKey(uint32_t key_id, const std::string& key); + const std::string& GetKey(const std::string& key_metadata); + + private: + std::map key_map_; +}; + +// Simple string key retriever +class PARQUET_EXPORT StringKeyIdRetriever : public DecryptionKeyRetriever { + public: + void PutKey(const std::string& key_id, const std::string& key); + const std::string& GetKey(const std::string& key_metadata); + + private: + std::map key_map_; +}; + +} // namespace parquet + +#endif // PARQUET_ENCRYPTION_H diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 6a11baedf826..970fbdf31d61 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -45,6 +45,7 @@ namespace parquet { static constexpr int64_t kDefaultFooterReadSize = 64 * 1024; static constexpr uint32_t kFooterSize = 8; static constexpr uint8_t kParquetMagic[4] = {'P', 'A', 'R', '1'}; +static constexpr uint8_t kParquetEMagic[4] = {'P', 'A', 'R', 'E'}; // For PARQUET-816 static constexpr int64_t kMaxDictHeaderSize = 100; @@ -81,9 +82,13 @@ const RowGroupMetaData* RowGroupReader::metadata() const { return contents_->met class SerializedRowGroup : public RowGroupReader::Contents { public: SerializedRowGroup(const std::shared_ptr& source, - FileMetaData* file_metadata, int row_group_number, + FileMetaData* file_metadata, + FileCryptoMetaData* file_crypto_metadata, int row_group_number, const ReaderProperties& props) - : source_(source), file_metadata_(file_metadata), properties_(props) { + : source_(source), + file_metadata_(file_metadata), + 
file_crypto_metadata_(file_crypto_metadata), + properties_(props) { row_group_metadata_ = file_metadata->RowGroup(row_group_number); } @@ -118,13 +123,65 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::shared_ptr stream = properties_.GetStream(source_, col_start, col_length); + std::unique_ptr crypto_meta_data = col->crypto_meta_data(); + + bool encrypted = true; + + // file is unencrypted + // or file is encrypted but column is unencrypted + if (!file_crypto_metadata_ || !crypto_meta_data) { + encrypted = false; + } + + if (!encrypted) { + return PageReader::Open(stream, col->num_values(), col->compression(), + nullptr, properties_.memory_pool()); + } + + // the column is encrypted + + auto file_decryption = properties_.file_decryption(); + + // the column is encrypted with footer key + if (crypto_meta_data->encrypted_with_footer_key()) { + std::string footer_key_metadata = file_crypto_metadata_->footer_key_metadata(); + std::string footer_key = file_decryption->GetFooterKey(footer_key_metadata); + + if (footer_key.empty()) { + throw ParquetException("column is encrypted with null footer key"); + } + + auto footer_encryption = std::make_shared( + file_crypto_metadata_->encryption_algorithm().algorithm, footer_key, + file_decryption->GetAad()); + + return PageReader::Open(stream, col->num_values(), col->compression(), + footer_encryption, properties_.memory_pool()); + } + + // file is non-uniform encrypted and the column is encrypted with its own key + + std::string column_key_metadata = crypto_meta_data->column_key_metadata(); + // encrypted with column key + std::string column_key = + file_decryption->GetColumnKey(col->path_in_schema(), column_key_metadata); + + if (column_key.empty()) { + throw ParquetException("column is encrypted with null key, path=" + + col->path_in_schema()->ToDotString()); + } + auto column_encryption = std::make_shared( + file_crypto_metadata_->encryption_algorithm().algorithm, column_key, + 
file_decryption->GetAad()); + return PageReader::Open(stream, col->num_values(), col->compression(), - properties_.memory_pool()); + column_encryption, properties_.memory_pool()); } private: std::shared_ptr source_; FileMetaData* file_metadata_; + FileCryptoMetaData* file_crypto_metadata_; std::unique_ptr row_group_metadata_; ReaderProperties properties_; }; @@ -145,7 +202,8 @@ class SerializedFile : public ParquetFileReader::Contents { std::shared_ptr GetRowGroup(int i) override { std::unique_ptr contents( - new SerializedRowGroup(source_, file_metadata_.get(), i, properties_)); + new SerializedRowGroup(source_, file_metadata_.get(), + file_crypto_metadata_.get(), i, properties_)); return std::make_shared(std::move(contents)); } @@ -175,38 +233,96 @@ class SerializedFile : public ParquetFileReader::Contents { // Check if all bytes are read. Check if last 4 bytes read have the magic bits if (footer_buffer->size() != footer_read_size || - memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) != 0) { + (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) != 0 && + memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) != 0)) { throw ParquetException("Invalid parquet file. Corrupt footer."); } - uint32_t metadata_len = arrow::util::SafeLoadAs( + // no encryption + if (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) == 0) { + uint32_t metadata_len = arrow::util::SafeLoadAs( + reinterpret_cast(footer_buffer->data()) + footer_read_size - + kFooterSize); + int64_t metadata_start = file_size - kFooterSize - metadata_len; + if (kFooterSize + metadata_len > file_size) { + throw ParquetException( + "Invalid parquet file. 
File is less than " + "file metadata size."); + } + + std::shared_ptr metadata_buffer; + // Check if the footer_buffer contains the entire metadata + if (footer_read_size >= (metadata_len + kFooterSize)) { + metadata_buffer = SliceBuffer( + footer_buffer, footer_read_size - metadata_len - kFooterSize, metadata_len); + } else { + PARQUET_THROW_NOT_OK( + source_->ReadAt(metadata_start, metadata_len, &metadata_buffer)); + if (metadata_buffer->size() != metadata_len) { + throw ParquetException("Invalid parquet file. Could not read metadata bytes."); + } + } + file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &metadata_len); + } + // encryption + else { + // read crypto metadata + uint32_t crypto_metadata_len = arrow::util::SafeLoadAs( reinterpret_cast(footer_buffer->data()) + footer_read_size - kFooterSize); - int64_t metadata_start = file_size - kFooterSize - metadata_len; - if (kFooterSize + metadata_len > file_size) { - throw ParquetException( - "Invalid parquet file. File is less than " - "file metadata size."); - } + int64_t crypto_metadata_start = file_size - kFooterSize - crypto_metadata_len; + + if (kFooterSize + crypto_metadata_len > file_size) { + throw ParquetException( + "Invalid parquet file. File is less than " + "file metadata size."); + } + + std::shared_ptr crypto_metadata_buffer; + // Check if the footer_buffer contains the entire metadata + if (footer_read_size >= (crypto_metadata_len + kFooterSize)) { + crypto_metadata_buffer = SliceBuffer( + footer_buffer, footer_read_size - crypto_metadata_len - kFooterSize, crypto_metadata_len); + } else { + PARQUET_THROW_NOT_OK( + source_->ReadAt(crypto_metadata_start, crypto_metadata_len, &crypto_metadata_buffer)); + if (crypto_metadata_buffer->size() != crypto_metadata_len) { + throw ParquetException("Invalid parquet file. 
Could not read metadata bytes."); + } + } + + file_crypto_metadata_ = + FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); + + int64_t footer_offset = file_crypto_metadata_->footer_offset(); + uint32_t footer_read_size = (uint32_t)(crypto_metadata_start - footer_offset); + + std::shared_ptr metadata_buffer = + SliceBuffer(footer_buffer, footer_offset, footer_read_size); + + if (file_crypto_metadata_->encrypted_footer()) { + // get footer key metadata + std::string footer_key_metadata = file_crypto_metadata_->footer_key_metadata(); + + auto file_decryption = properties_.file_decryption(); + std::string footer_key = file_decryption->GetFooterKey(footer_key_metadata); + + auto footer_encryption = std::make_shared( + file_crypto_metadata_->encryption_algorithm().algorithm, footer_key, + file_decryption->GetAad()); - std::shared_ptr metadata_buffer; - // Check if the footer_buffer contains the entire metadata - if (footer_read_size >= (metadata_len + kFooterSize)) { - metadata_buffer = SliceBuffer( - footer_buffer, footer_read_size - metadata_len - kFooterSize, metadata_len); - } else { - PARQUET_THROW_NOT_OK( - source_->ReadAt(metadata_start, metadata_len, &metadata_buffer)); - if (metadata_buffer->size() != metadata_len) { - throw ParquetException("Invalid parquet file. 
Could not read metadata bytes."); + file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &footer_read_size, + footer_encryption); + } else { + file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &footer_read_size); } } - file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &metadata_len); } private: std::shared_ptr source_; std::shared_ptr file_metadata_; + std::shared_ptr file_crypto_metadata_; ReaderProperties properties_; }; diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 22c75fa05fb1..64716400070a 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -36,6 +36,7 @@ namespace parquet { // FIXME: copied from reader-internal.cc static constexpr uint8_t PARQUET_MAGIC[4] = {'P', 'A', 'R', '1'}; +static constexpr uint8_t PARQUET_EMAGIC[4] = {'P', 'A', 'R', 'E'}; // ---------------------------------------------------------------------- // RowGroupWriter public API @@ -126,7 +127,8 @@ class RowGroupSerializer : public RowGroupWriter::Contents { const ColumnDescriptor* column_descr = col_meta->descr(); std::unique_ptr pager = - PageWriter::Open(sink_, properties_->compression(column_descr->path()), col_meta, + PageWriter::Open(sink_, properties_->compression(column_descr->path()), + properties_->encryption(column_descr->path()), col_meta, // TODO properties_->memory_pool()); column_writers_[0] = ColumnWriter::Make(col_meta, std::move(pager), properties_); return column_writers_[0].get(); @@ -224,7 +226,8 @@ class RowGroupSerializer : public RowGroupWriter::Contents { const ColumnDescriptor* column_descr = col_meta->descr(); std::unique_ptr pager = PageWriter::Open(sink_, properties_->compression(column_descr->path()), - col_meta, properties_->memory_pool(), buffered_row_group_); + properties_->encryption(column_descr->path()), col_meta, + properties_->memory_pool(), buffered_row_group_); column_writers_.push_back( ColumnWriter::Make(col_meta, std::move(pager), properties_)); } @@ 
-265,7 +268,22 @@ class FileSerializer : public ParquetFileWriter::Contents { // Write magic bytes and metadata file_metadata_ = metadata_->Finish(); - WriteFileMetaData(*file_metadata_, sink_.get()); + + auto file_encryption = properties_->file_encryption(); + if (file_encryption == nullptr) { + WriteFileMetaData(*file_metadata_, sink_.get()); + } else { + uint64_t metadata_start = static_cast(sink_->Tell()); + + std::shared_ptr footer_encryption = + file_encryption->GetFooterEncryptionProperties(); + WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryption.get()); + + auto crypto_metadata = metadata_->GetCryptoMetaData(metadata_start); + WriteFileCryptoMetaData(*crypto_metadata, sink_.get()); + } + + sink_->Close(); } } @@ -327,8 +345,12 @@ class FileSerializer : public ParquetFileWriter::Contents { std::unique_ptr row_group_writer_; void StartFile() { - // Parquet files always start with PAR1 - PARQUET_THROW_NOT_OK(sink_->Write(PARQUET_MAGIC, 4)); + if (properties_->file_encryption() == nullptr) { + // Parquet files always start with PAR1 + PARQUET_THROW_NOT_OK(sink_->Write(PARQUET_MAGIC, 4)); + } else { + PARQUET_THROW_NOT_OK(sink_->Write(PARQUET_EMAGIC, 4)); + } } }; @@ -364,25 +386,52 @@ std::unique_ptr ParquetFileWriter::Open( key_value_metadata); } -void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sink) { +void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sink, + EncryptionProperties* footer_encryption) { + if (footer_encryption == nullptr) { + // Write MetaData + int64_t position = -1; + PARQUET_THROW_NOT_OK(sink->Tell(&position)); + uint32_t metadata_len = static_cast(position); + + file_metadata.WriteTo(sink); + + PARQUET_THROW_NOT_OK(sink->Tell(&position)); + metadata_len = static_cast(position) - metadata_len; + + // Write Footer + PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast(&metadata_len), 4)); + PARQUET_THROW_NOT_OK(sink->Write(PARQUET_MAGIC, 4)); + } else { + // encrypt and write 
to sink + file_metadata.WriteTo(sink, footer_encryption); + } +} + +void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink) { + ParquetOutputWrapper wrapper(sink); + return WriteFileMetaData(file_metadata, &wrapper); +} + +void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, + ArrowOutputStream* sink) { int64_t position = -1; PARQUET_THROW_NOT_OK(sink->Tell(&position)); + uint64_t crypto_offset = static_cast(position); - // Write MetaData - uint32_t metadata_len = static_cast(position); - - file_metadata.WriteTo(sink); + // Get a FileCryptoMetaData + crypto_metadata.WriteTo(sink); PARQUET_THROW_NOT_OK(sink->Tell(&position)); - metadata_len = static_cast(position) - metadata_len; + auto crypto_len = static_cast(position) - crypto_offset; + PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast(&crypto_len), 4)); - // Write Footer - PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast(&metadata_len), 4)); - PARQUET_THROW_NOT_OK(sink->Write(PARQUET_MAGIC, 4)); + PARQUET_THROW_NOT_OK(sink->Write(PARQUET_EMAGIC, 4)); } -void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink) { +void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, + OutputStream* sink) { ParquetOutputWrapper wrapper(sink); - return WriteFileMetaData(file_metadata, &wrapper); + return WriteFileCryptoMetaData(crypto_metadata, &wrapper); } void WriteMetaDataFile(const FileMetaData& file_metadata, ArrowOutputStream* sink) { diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h index 585e74465e3d..cf3cd1312511 100644 --- a/cpp/src/parquet/file_writer.h +++ b/cpp/src/parquet/file_writer.h @@ -94,7 +94,10 @@ class PARQUET_EXPORT RowGroupWriter { ARROW_DEPRECATED("Use version with arrow::io::OutputStream*") PARQUET_EXPORT -void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink); +void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink, + EncryptionProperties* 
encryption_properties = NULLPTR); +void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, + OutputStream* sink); PARQUET_EXPORT void WriteFileMetaData(const FileMetaData& file_metadata, diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index ebf71c0f39f7..d48fd36bf6b1 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -122,6 +122,52 @@ std::shared_ptr MakeColumnStats(const format::ColumnMetaData& meta_d } // MetaData Accessor +// ColumnCryptoMetaData +class ColumnCryptoMetaData::ColumnCryptoMetaDataImpl { + public: + explicit ColumnCryptoMetaDataImpl(const format::ColumnCryptoMetaData* crypto_metadata) + : crypto_metadata_(crypto_metadata) {} + + ~ColumnCryptoMetaDataImpl() {} + + bool encrypted_with_footer_key() const { + return crypto_metadata_->__isset.ENCRYPTION_WITH_FOOTER_KEY; + } + bool encrypted_with_column_key() const { + return crypto_metadata_->__isset.ENCRYPTION_WITH_COLUMN_KEY; + } + const std::vector& path_in_schema() const { + return crypto_metadata_->ENCRYPTION_WITH_COLUMN_KEY.path_in_schema; + } + const std::string& column_key_metadata() const { + return crypto_metadata_->ENCRYPTION_WITH_COLUMN_KEY.column_key_metadata; + } + + private: + const format::ColumnCryptoMetaData* crypto_metadata_; +}; + +std::unique_ptr ColumnCryptoMetaData::Make( + const uint8_t* metadata) { + return std::unique_ptr(new ColumnCryptoMetaData(metadata)); +} + +ColumnCryptoMetaData::ColumnCryptoMetaData(const uint8_t* metadata) + : impl_(new ColumnCryptoMetaDataImpl( + reinterpret_cast(metadata))) {} + +ColumnCryptoMetaData::~ColumnCryptoMetaData() {} + +const std::vector& ColumnCryptoMetaData::path_in_schema() const { + return impl_->path_in_schema(); +} +bool ColumnCryptoMetaData::encrypted_with_footer_key() const { + return impl_->encrypted_with_footer_key(); +} +const std::string& ColumnCryptoMetaData::column_key_metadata() const { + return impl_->column_key_metadata(); +} + // ColumnChunk metadata class 
ColumnChunkMetaData::ColumnChunkMetaDataImpl { public: @@ -204,6 +250,15 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { return column_->meta_data.total_uncompressed_size; } + inline std::unique_ptr crypto_meta_data() const { + if (column_->__isset.crypto_meta_data) { + return ColumnCryptoMetaData::Make( + reinterpret_cast(&column_->crypto_meta_data)); + } else { + return nullptr; + } + } + private: mutable std::shared_ptr possible_stats_; std::vector encodings_; @@ -281,6 +336,10 @@ int64_t ColumnChunkMetaData::total_compressed_size() const { return impl_->total_compressed_size(); } +std::unique_ptr ColumnChunkMetaData::crypto_meta_data() const { + return impl_->crypto_meta_data(); +} + // row-group metadata class RowGroupMetaData::RowGroupMetaDataImpl { public: @@ -345,11 +404,12 @@ class FileMetaData::FileMetaDataImpl { public: FileMetaDataImpl() : metadata_len_(0) {} - explicit FileMetaDataImpl(const void* metadata, uint32_t* metadata_len) + explicit FileMetaDataImpl(const void* metadata, uint32_t* metadata_len, + std::shared_ptr encryption = nullptr) : metadata_len_(0) { metadata_.reset(new format::FileMetaData); DeserializeThriftMsg(reinterpret_cast(metadata), metadata_len, - metadata_.get()); + metadata_.get(), encryption.get()); metadata_len_ = *metadata_len; if (metadata_->__isset.created_by) { @@ -377,9 +437,9 @@ class FileMetaData::FileMetaDataImpl { const ApplicationVersion& writer_version() const { return writer_version_; } - void WriteTo(::arrow::io::OutputStream* dst) const { + void WriteTo(::arrow::io::OutputStream* dst, EncryptionProperties* encryption) const { ThriftSerializer serializer; - serializer.Serialize(metadata_.get(), dst); + serializer.Serialize(metadata_.get(), dst, encryption); } std::unique_ptr RowGroup(int i) { @@ -464,14 +524,17 @@ class FileMetaData::FileMetaDataImpl { }; std::shared_ptr FileMetaData::Make(const void* metadata, - uint32_t* metadata_len) { + uint32_t* metadata_len, + std::shared_ptr encryption) { // This 
FileMetaData ctor is private, not compatible with std::make_shared - return std::shared_ptr(new FileMetaData(metadata, metadata_len)); + return std::shared_ptr( + new FileMetaData(metadata, metadata_len, encryption)); } -FileMetaData::FileMetaData(const void* metadata, uint32_t* metadata_len) +FileMetaData::FileMetaData(const void* metadata, uint32_t* metadata_len, + std::shared_ptr encryption) : impl_{std::unique_ptr( - new FileMetaDataImpl(metadata, metadata_len))} {} + new FileMetaDataImpl(metadata, metadata_len, encryption))} {} FileMetaData::FileMetaData() : impl_{std::unique_ptr(new FileMetaDataImpl())} {} @@ -523,10 +586,68 @@ void FileMetaData::AppendRowGroups(const FileMetaData& other) { impl_->AppendRowGroups(other.impl_); } -void FileMetaData::WriteTo(::arrow::io::OutputStream* dst) const { - return impl_->WriteTo(dst); +void FileMetaData::WriteTo(::arrow::io::OutputStream* dst, EncryptionProperties* encryption) const { + return impl_->WriteTo(dst, encryption); +} + +class FileCryptoMetaData::FileCryptoMetaDataImpl { + public: + FileCryptoMetaDataImpl() {} + + explicit FileCryptoMetaDataImpl(const uint8_t* metadata, uint32_t* metadata_len) { + metadata_.reset(new format::FileCryptoMetaData); + DeserializeThriftMsg(metadata, metadata_len, metadata_.get()); + metadata_len_ = *metadata_len; + } + + ~FileCryptoMetaDataImpl() {} + + EncryptionAlgorithm encryption_algorithm() { + return FromThrift(metadata_->encryption_algorithm); + } + + bool encrypted_footer() { return metadata_->encrypted_footer; } + + const std::string& footer_key_metadata() { return metadata_->footer_key_metadata; } + + uint64_t footer_offset() { return metadata_->footer_offset; } + + void WriteTo(::arrow::io::OutputStream* dst) const { + ThriftSerializer serializer; + serializer.Serialize(metadata_.get(), dst); + } + + private: + friend FileMetaDataBuilder; + std::unique_ptr metadata_; + uint32_t metadata_len_; +}; + +EncryptionAlgorithm FileCryptoMetaData::encryption_algorithm() { + 
return impl_->encryption_algorithm(); +} +bool FileCryptoMetaData::encrypted_footer() { return impl_->encrypted_footer(); } +const std::string& FileCryptoMetaData::footer_key_metadata() { + return impl_->footer_key_metadata(); +} +uint64_t FileCryptoMetaData::footer_offset() { return impl_->footer_offset(); } + +std::shared_ptr FileCryptoMetaData::Make( + const uint8_t* serialized_metadata, uint32_t* metadata_len) { + return std::shared_ptr( + new FileCryptoMetaData(serialized_metadata, metadata_len)); } +FileCryptoMetaData::FileCryptoMetaData(const uint8_t* serialized_metadata, + uint32_t* metadata_len) + : impl_(new FileCryptoMetaDataImpl(serialized_metadata, metadata_len)) {} + +FileCryptoMetaData::FileCryptoMetaData() : impl_(new FileCryptoMetaDataImpl()) {} + +FileCryptoMetaData::~FileCryptoMetaData() {} + +void FileCryptoMetaData::WriteTo(::arrow::io::OutputStream* dst) const { impl_->WriteTo(dst); } + ApplicationVersion::ApplicationVersion(const std::string& application, int major, int minor, int patch) : application_(application), version{major, minor, patch, "", "", ""} {} @@ -657,7 +778,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { // column metadata void SetStatistics(const EncodedStatistics& val) { - column_chunk_->meta_data.__set_statistics(ToThrift(val)); + column_metadata_.__set_statistics(ToThrift(val)); } void Finish(int64_t num_values, int64_t dictionary_page_offset, @@ -665,19 +786,20 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, bool dictionary_fallback) { if (dictionary_page_offset > 0) { - column_chunk_->meta_data.__set_dictionary_page_offset(dictionary_page_offset); + column_metadata_.__set_dictionary_page_offset(dictionary_page_offset); column_chunk_->__set_file_offset(dictionary_page_offset + compressed_size); } else { column_chunk_->__set_file_offset(data_page_offset + compressed_size); } - 
column_chunk_->__isset.meta_data = true; - column_chunk_->meta_data.__set_num_values(num_values); + + column_metadata_.__set_num_values(num_values); if (index_page_offset >= 0) { - column_chunk_->meta_data.__set_index_page_offset(index_page_offset); + column_metadata_.__set_index_page_offset(index_page_offset); } - column_chunk_->meta_data.__set_data_page_offset(data_page_offset); - column_chunk_->meta_data.__set_total_uncompressed_size(uncompressed_size); - column_chunk_->meta_data.__set_total_compressed_size(compressed_size); + column_metadata_.__set_data_page_offset(data_page_offset); + column_metadata_.__set_total_uncompressed_size(uncompressed_size); + column_metadata_.__set_total_compressed_size(compressed_size); + std::vector thrift_encodings; if (has_dictionary) { thrift_encodings.push_back(ToThrift(properties_->dictionary_index_encoding())); @@ -695,12 +817,62 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { if (dictionary_fallback) { thrift_encodings.push_back(ToThrift(Encoding::PLAIN)); } - column_chunk_->meta_data.__set_encodings(thrift_encodings); + column_metadata_.__set_encodings(thrift_encodings); } void WriteTo(::arrow::io::OutputStream* sink) { ThriftSerializer serializer; - serializer.Serialize(column_chunk_, sink); + const auto& encrypt_md = properties_->column_encryption_props(column_->path()); + + // column is unencrypted + if (!encrypt_md || !encrypt_md->encrypted()) { + column_chunk_->__isset.meta_data = true; + column_chunk_->__set_meta_data(column_metadata_); + + serializer.Serialize(column_chunk_, sink); + } else { // column is encrypted + column_chunk_->__isset.crypto_meta_data = true; + + // encrypted with footer key + format::ColumnCryptoMetaData ccmd; + if (encrypt_md->encrypted_with_footer_key()) { + ccmd.__isset.ENCRYPTION_WITH_FOOTER_KEY = true; + ccmd.__set_ENCRYPTION_WITH_FOOTER_KEY(format::EncryptionWithFooterKey()); + } else { // encrypted with column key + format::EncryptionWithColumnKey eck; + 
eck.__set_column_key_metadata(encrypt_md->key_metadata()); + eck.__set_path_in_schema(column_->path()->ToDotVector()); + ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY = true; + ccmd.__set_ENCRYPTION_WITH_COLUMN_KEY(eck); + } + column_chunk_->__set_crypto_meta_data(ccmd); + + auto footer_encryption = properties_->footer_encryption(); + + // non-uniform: footer is unencrypted, or column is encrypted with a column-specific + // key + if ((footer_encryption == nullptr && encrypt_md->encrypted()) || + !encrypt_md->encrypted_with_footer_key()) { + // don't set meta_data + column_chunk_->__isset.meta_data = false; + + // Thrift-serialize the ColumnMetaData structure, + // encrypt it with the column key, and write the result to the output stream + // (first length, then buffer) + auto encrypt_props = properties_->encryption(column_->path()); + uint64_t metadata_start = sink->Tell(); + + serializer.Serialize(&column_metadata_, sink, encrypt_props.get()); + + // Store the ColumnMetaData offset in the "file_offset" field of the ColumnChunk. 
+ column_chunk_->__set_file_offset(metadata_start); + } else { + column_chunk_->__isset.meta_data = true; + column_chunk_->__set_meta_data(column_metadata_); + } + + serializer.Serialize(column_chunk_, sink); + } } const ColumnDescriptor* descr() const { return column_; } @@ -708,14 +880,15 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { private: void Init(format::ColumnChunk* column_chunk) { column_chunk_ = column_chunk; - column_chunk_->meta_data.__set_type(ToThrift(column_->physical_type())); - column_chunk_->meta_data.__set_path_in_schema(column_->path()->ToDotVector()); - column_chunk_->meta_data.__set_codec( - ToThrift(properties_->compression(column_->path()))); + column_metadata_ = column_chunk_->meta_data; + column_metadata_.__set_type(ToThrift(column_->physical_type())); + column_metadata_.__set_path_in_schema(column_->path()->ToDotVector()); + column_metadata_.__set_codec(ToThrift(properties_->compression(column_->path()))); } format::ColumnChunk* column_chunk_; std::unique_ptr owned_column_chunk_; + format::ColumnMetaData column_metadata_; const std::shared_ptr properties_; const ColumnDescriptor* column_; }; @@ -808,20 +981,22 @@ class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { << " columns are initialized"; throw ParquetException(ss.str()); } - int64_t total_byte_size = 0; - - for (int i = 0; i < schema_->num_columns(); i++) { - if (!(row_group_->columns[i].file_offset >= 0)) { - std::stringstream ss; - ss << "Column " << i << " is not complete."; - throw ParquetException(ss.str()); - } - total_byte_size += row_group_->columns[i].meta_data.total_compressed_size; - } - DCHECK(total_bytes_written == total_byte_size) - << "Total bytes in this RowGroup does not match with compressed sizes of columns"; - - row_group_->__set_total_byte_size(total_byte_size); + // int64_t total_byte_size = 0; + + // for (int i = 0; i < schema_->num_columns(); i++) { + // if (!(row_group_->columns[i].file_offset >= 0)) { + // 
std::stringstream ss; + // ss << "Column " << i << " is not complete."; + // throw ParquetException(ss.str()); + // } + // total_byte_size += row_group_->columns[i].meta_data.total_compressed_size; + // } + // DCHECK(total_bytes_written == total_byte_size) + // << "Total bytes in this RowGroup does not match with compressed sizes of + // columns"; + + // row_group_->__set_total_byte_size(total_byte_size); + row_group_->__set_total_byte_size(total_bytes_written); } void set_num_rows(int64_t num_rows) { row_group_->num_rows = num_rows; } @@ -882,6 +1057,9 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { const std::shared_ptr& key_value_metadata) : properties_(props), schema_(schema), key_value_metadata_(key_value_metadata) { metadata_.reset(new format::FileMetaData()); + if (props->footer_encryption() != nullptr) { + crypto_metadata_.reset(new format::FileCryptoMetaData()); + } } RowGroupMetaDataBuilder* AppendRowGroup() { @@ -947,8 +1125,39 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { return file_meta_data; } + std::unique_ptr BuildFileCryptoMetaData(uint64_t footerOffset) { + if (crypto_metadata_ == nullptr) { + return nullptr; + } + + auto file_encryption = properties_->file_encryption(); + auto footer_encryption = properties_->footer_encryption(); + + // build format::FileCryptoMetaData + EncryptionAlgorithm encryption_algorithm; + encryption_algorithm.algorithm = footer_encryption->algorithm(); + encryption_algorithm.aad_metadata = file_encryption->aad_metadata(); + crypto_metadata_->__set_encryption_algorithm(ToThrift(encryption_algorithm)); + crypto_metadata_->__set_encrypted_footer(!footer_encryption->key().empty()); + + std::string footer_key_metadata = file_encryption->footer_key_metadata(); + if (!footer_key_metadata.empty()) { + crypto_metadata_->__set_footer_key_metadata(footer_key_metadata); + } + crypto_metadata_->__set_footer_offset(footerOffset); + + // TODO set iv_prefix??? 
+ + // return as FileCryptoMetaData + std::unique_ptr file_crypto_meta_data = + std::unique_ptr(new FileCryptoMetaData()); + file_crypto_meta_data->impl_->metadata_ = std::move(crypto_metadata_); + return file_crypto_meta_data; + } + protected: std::unique_ptr metadata_; + std::unique_ptr crypto_metadata_; private: const std::shared_ptr properties_; @@ -980,4 +1189,9 @@ RowGroupMetaDataBuilder* FileMetaDataBuilder::AppendRowGroup() { std::unique_ptr FileMetaDataBuilder::Finish() { return impl_->Finish(); } +std::unique_ptr FileMetaDataBuilder::GetCryptoMetaData( + uint64_t footerOffset) { + return impl_->BuildFileCryptoMetaData(footerOffset); +} + } // namespace parquet diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 7df4f6d9e2a6..2ec7e5b17f75 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -97,6 +97,22 @@ class PARQUET_EXPORT ApplicationVersion { SortOrder::type sort_order = SortOrder::SIGNED) const; }; +class PARQUET_EXPORT ColumnCryptoMetaData { + public: + static std::unique_ptr Make(const uint8_t* metadata); + ~ColumnCryptoMetaData(); + + const std::vector& path_in_schema() const; + bool encrypted_with_footer_key() const; + const std::string& column_key_metadata() const; + + private: + explicit ColumnCryptoMetaData(const uint8_t* metadata); + + class ColumnCryptoMetaDataImpl; + std::unique_ptr impl_; +}; + class PARQUET_EXPORT ColumnChunkMetaData { public: // API convenience to get a MetaData accessor @@ -127,6 +143,7 @@ class PARQUET_EXPORT ColumnChunkMetaData { int64_t index_page_offset() const; int64_t total_compressed_size() const; int64_t total_uncompressed_size() const; + std::unique_ptr crypto_meta_data() const; private: explicit ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, @@ -167,7 +184,8 @@ class PARQUET_EXPORT FileMetaData { public: // API convenience to get a MetaData accessor static std::shared_ptr Make(const void* serialized_metadata, - uint32_t* metadata_len); + 
uint32_t* metadata_len, + std::shared_ptr encryption = NULLPTR); ~FileMetaData(); @@ -186,7 +204,7 @@ class PARQUET_EXPORT FileMetaData { const ApplicationVersion& writer_version() const; - void WriteTo(::arrow::io::OutputStream* dst) const; + void WriteTo(::arrow::io::OutputStream* dst, EncryptionProperties* encryption = NULLPTR) const; // Return const-pointer to make it clear that this object is not to be copied const SchemaDescriptor* schema() const; @@ -201,7 +219,8 @@ class PARQUET_EXPORT FileMetaData { private: friend FileMetaDataBuilder; - explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len); + explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len, + std::shared_ptr encryption = NULLPTR); // PIMPL Idiom FileMetaData(); @@ -209,6 +228,30 @@ class PARQUET_EXPORT FileMetaData { std::unique_ptr impl_; }; +class PARQUET_EXPORT FileCryptoMetaData { + public: + // API convenience to get a MetaData accessor + static std::shared_ptr Make(const uint8_t* serialized_metadata, + uint32_t* metadata_len); + ~FileCryptoMetaData(); + + EncryptionAlgorithm encryption_algorithm(); + bool encrypted_footer(); + const std::string& footer_key_metadata(); + uint64_t footer_offset(); + + void WriteTo(::arrow::io::OutputStream* dst) const; + + private: + friend FileMetaDataBuilder; + FileCryptoMetaData(const uint8_t* serialized_metadata, uint32_t* metadata_len); + + // PIMPL Idiom + FileCryptoMetaData(); + class FileCryptoMetaDataImpl; + std::unique_ptr impl_; +}; + // Builder API class PARQUET_EXPORT ColumnChunkMetaDataBuilder { public: @@ -293,6 +336,9 @@ class PARQUET_EXPORT FileMetaDataBuilder { // Complete the Thrift structure std::unique_ptr Finish(); + // crypto metadata + std::unique_ptr GetCryptoMetaData(uint64_t footerOffset); + private: explicit FileMetaDataBuilder( const SchemaDescriptor* schema, const std::shared_ptr& props, diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 
209969a00544..a54162bb55ea 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -25,11 +25,13 @@ #include "arrow/type.h" +#include "parquet/encryption.h" #include "parquet/exception.h" #include "parquet/parquet_version.h" #include "parquet/platform.h" #include "parquet/schema.h" #include "parquet/types.h" +#include "arrow/util/logging.h" namespace parquet { @@ -40,6 +42,127 @@ struct ParquetVersion { static int64_t DEFAULT_BUFFER_SIZE = 1024; static bool DEFAULT_USE_BUFFERED_STREAM = false; +class PARQUET_EXPORT ColumnEncryptionProperties { + public: + class Builder { + public: + Builder(const std::string& path, bool encrypt) + : path_(path), encrypt_(encrypt), encrypted_with_footer_key_(encrypt) {} + + Builder* key(const std::string& key) { + DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); + DCHECK(encrypt_); + + key_ = key; + return this; + } + Builder* key_metadata(const std::string& key_id) { + DCHECK(!key_id.empty()); + key_metadata_ = key_id; + return this; + } + + Builder* key_id(uint32_t key_id) { + std::string key_metadata = std::string(reinterpret_cast(&key_id), 4); + this->key_metadata(key_metadata); + return this; + } + + std::shared_ptr build() { + return std::make_shared( + path_, encrypt_, encrypted_with_footer_key_, key_, key_metadata_); + } + + private: + std::string path_; + bool encrypt_; + bool encrypted_with_footer_key_; + std::string key_; + std::string key_metadata_; + }; + + ColumnEncryptionProperties() = default; + ColumnEncryptionProperties(const ColumnEncryptionProperties& other) = default; + ColumnEncryptionProperties(ColumnEncryptionProperties&& other) = default; + + ColumnEncryptionProperties(const std::string& path, bool encrypt, + bool encrypted_with_footer_key, const std::string& key, + const std::string& key_metadata) + : path_(path), + encrypt_(encrypt), + encrypted_with_footer_key_(encrypted_with_footer_key), + key_(key), + key_metadata_(key_metadata) {} + + const std::string& 
path() const { return path_; } + bool encrypted() const { return encrypt_; } + bool encrypted_with_footer_key() const { return encrypted_with_footer_key_; } + const std::string& key() const { return key_; } + const std::string& key_metadata() const { return key_metadata_; } + + private: + std::string path_; + bool encrypt_; + bool encrypted_with_footer_key_; + std::string key_; + std::string key_metadata_; +}; + +class PARQUET_EXPORT FileDecryptionProperties { + public: + FileDecryptionProperties(const std::string& footer_key) : footer_key_(footer_key) { + DCHECK(footer_key_.length() == 16 || footer_key_.length() == 24 || + footer_key_.length() == 32); + } + + FileDecryptionProperties(const std::shared_ptr& key_retriever) + : key_retriever_(key_retriever) {} + + void SetAad(const std::string& aad) { aad_ = aad; } + + void SetColumnKey(const std::string& name, const std::string& key) { + SetColumnKey(std::vector({name}), key); + } + + void SetColumnKey(const std::vector& paths, const std::string& key) { + DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); + + schema::ColumnPath columnPath(paths); + + column_keys_[columnPath.ToDotString()] = key; + } + + const std::string& GetColumnKey(const std::shared_ptr& columnPath, + const std::string& key_metadata = "") { + if (key_metadata.empty()) { + return column_keys_.at(columnPath->ToDotString()); + } + if (key_retriever_ == NULLPTR) { + throw ParquetException("no key retriever is provided for column key metadata"); + } + return key_retriever_->GetKey(key_metadata); + } + + const std::string& GetFooterKey(const std::string& footer_key_metadata = "") { + if (footer_key_metadata.empty()) { + return footer_key_; + } + if (key_retriever_ == NULLPTR) { + throw ParquetException("no key retriever is provided for footer key metadata"); + } + return key_retriever_->GetKey(footer_key_metadata); + } + const std::string& GetAad() { return aad_; } + + private: + std::string footer_key_; + std::string aad_; + + 
std::map column_keys_; + + std::shared_ptr key_retriever_; +}; + class PARQUET_EXPORT ReaderProperties { public: explicit ReaderProperties(MemoryPool* pool = ::arrow::default_memory_pool()) @@ -63,10 +186,17 @@ class PARQUET_EXPORT ReaderProperties { int64_t buffer_size() const { return buffer_size_; } + void file_decryption(const std::shared_ptr& decryption) { + file_decryption_ = decryption; + } + + FileDecryptionProperties* file_decryption() { return file_decryption_.get(); } + private: MemoryPool* pool_; int64_t buffer_size_; bool buffered_stream_enabled_; + std::shared_ptr file_decryption_; }; ReaderProperties PARQUET_EXPORT default_reader_properties(); @@ -83,6 +213,10 @@ static constexpr ParquetVersion::type DEFAULT_WRITER_VERSION = ParquetVersion::PARQUET_1_0; static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION; static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED; +static constexpr Encryption::type DEFAULT_ENCRYPTION_ALGORITHM = Encryption::AES_GCM_V1; +static constexpr int32_t MAXIMAL_KEY_METADATA_LENGTH = 256; +static constexpr int32_t MAXIMAL_AAD_METADATA_LENGTH = 256; +static constexpr bool DEFAULT_ENCRYPT_THE_REST = true; class PARQUET_EXPORT ColumnProperties { public: @@ -131,6 +265,195 @@ class PARQUET_EXPORT ColumnProperties { size_t max_stats_size_; }; +class PARQUET_EXPORT FileEncryptionProperties { + public: + class Builder { + public: + Builder() : algorithm_(DEFAULT_ENCRYPTION_ALGORITHM), uniform_encryption_(true) {} + + Builder(const std::string& key) + : algorithm_(DEFAULT_ENCRYPTION_ALGORITHM), uniform_encryption_(true) { + DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); + footer_key_ = key; + } + + Builder* algorithm(Encryption::type algorithm) { + algorithm_ = algorithm; + return this; + } + + Builder* footer_key(const std::string& key) { + DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); + footer_key_ = key; + return this; + } + + Builder* 
footer_key_metadata(const std::string& key_metadata) { + DCHECK(!footer_key_.empty()); + DCHECK(!key_metadata.empty() && + key_metadata.length() < MAXIMAL_KEY_METADATA_LENGTH); + footer_key_metadata_ = key_metadata; + return this; + } + + Builder* aad(const std::string& aad) { + DCHECK(!aad.empty()); + aad_ = aad; + return this; + } + + Builder* aad_metadata(const std::string& aad_metadata) { + DCHECK(!aad_.empty()); + DCHECK(!aad_metadata.empty() && + aad_metadata.length() < MAXIMAL_AAD_METADATA_LENGTH); + aad_metadata_ = aad_metadata; + return this; + } + + /** + * encrypt_the_rest will define if other columns (not defined in columns argument) + * will be encrypted or not + * if encrypt_the_rest = true, other columns will be encrypted with footer key + * else, other columns will be unencrypted + */ + Builder* column_properties( + const std::map>& + column_properties, + bool encrypt_the_rest = DEFAULT_ENCRYPT_THE_REST) { + encrypt_the_rest_ = encrypt_the_rest; + column_properties_ = column_properties; + + if (!footer_key_.empty()) { + uniform_encryption_ = true; + + for (const auto& col : column_properties) { + if (col.second->key().compare(footer_key_) != 0) { + uniform_encryption_ = false; + break; + } + } + } else { + if (encrypt_the_rest) { + throw ParquetException("Encrypt the rest with null footer key"); + } + bool all_are_unencrypted = true; + for (const auto& col : column_properties) { + if (col.second->encrypted()) { + if (col.second->key().empty()) { + throw ParquetException("Encrypt column with null footer key"); + } + all_are_unencrypted = false; + } + } + + if (all_are_unencrypted) { + throw ParquetException("Footer and all columns unencrypted"); + } + } + return this; + } + + std::shared_ptr build() { + std::shared_ptr footer_encryption; + if (!footer_key_.empty()) { + footer_encryption.reset(new EncryptionProperties(algorithm_, footer_key_, aad_)); + } + return std::make_shared( + footer_encryption, footer_key_metadata_, aad_metadata_, 
uniform_encryption_, + column_properties_, encrypt_the_rest_); + } + + private: + Encryption::type algorithm_; + std::string footer_key_; + std::string footer_key_metadata_; + + std::string aad_; + std::string aad_metadata_; + + bool uniform_encryption_; + + std::map> column_properties_; + bool encrypt_the_rest_; + }; + + FileEncryptionProperties( + const std::shared_ptr& footer_encryption, + const std::string& footer_key_metadata, const std::string& aad_metadata, + bool uniform_encryption, + const std::map>& + column_properties, + bool encrypt_the_rest) + : footer_encryption_(footer_encryption), + footer_key_metadata_(footer_key_metadata), + aad_metadata_(aad_metadata), + uniform_encryption_(uniform_encryption), + column_properties_(column_properties), + encrypt_the_rest_(encrypt_the_rest) {} + + std::shared_ptr GetFooterEncryptionProperties() { + return footer_encryption_; + } + + const std::string& footer_key_metadata() const { return footer_key_metadata_; } + + const std::string& aad_metadata() const { return aad_metadata_; } + + std::shared_ptr GetColumnCryptoMetaData( + const std::shared_ptr& path) { + // uniform encryption + if (uniform_encryption_) { + return ColumnEncryptionProperties::Builder(path->ToDotString(), true).build(); + } + + // non-uniform encryption + std::string path_str = path->ToDotString(); + if (column_properties_.find(path_str) != column_properties_.end()) { + return column_properties_[path_str]; + } + + // encrypted with footer key + if (encrypt_the_rest_) { + return ColumnEncryptionProperties::Builder(path->ToDotString(), true).build(); + } + + // unencrypted + return ColumnEncryptionProperties::Builder(path->ToDotString(), false).build(); + } + + std::shared_ptr GetColumnEncryptionProperties( + const std::shared_ptr& path) { + // uniform encryption + if (uniform_encryption_) { + return footer_encryption_; + } + + // non-uniform encryption + std::string path_str = path->ToDotString(); + if (column_properties_.find(path_str) != 
column_properties_.end()) { + return std::make_shared(footer_encryption_->algorithm(), + column_properties_[path_str]->key(), + footer_encryption_->aad()); + } + + if (encrypt_the_rest_) { + return footer_encryption_; + } + + return NULLPTR; + } + + private: + std::shared_ptr footer_encryption_; + std::string footer_key_metadata_; + std::string aad_metadata_; + + bool uniform_encryption_; + + std::map> column_properties_; + bool encrypt_the_rest_; +}; + class PARQUET_EXPORT WriterProperties { public: class Builder { @@ -271,6 +594,12 @@ class PARQUET_EXPORT WriterProperties { return this->compression(path->ToDotString(), codec); } + Builder* encryption( + const std::shared_ptr& file_encryption) { + file_encryption_ = file_encryption; + return this; + } + Builder* enable_statistics() { default_column_properties_.set_statistics_enabled(true); return this; @@ -316,10 +645,10 @@ class PARQUET_EXPORT WriterProperties { for (const auto& item : statistics_enabled_) get(item.first).set_statistics_enabled(item.second); - return std::shared_ptr( - new WriterProperties(pool_, dictionary_pagesize_limit_, write_batch_size_, - max_row_group_length_, pagesize_, version_, created_by_, - default_column_properties_, column_properties)); + return std::shared_ptr(new WriterProperties( + pool_, dictionary_pagesize_limit_, write_batch_size_, max_row_group_length_, + pagesize_, version_, created_by_, std::move(file_encryption_), + default_column_properties_, column_properties)); } private: @@ -330,6 +659,7 @@ class PARQUET_EXPORT WriterProperties { int64_t pagesize_; ParquetVersion::type version_; std::string created_by_; + std::shared_ptr file_encryption_; // Settings used for each column unless overridden in any of the maps below ColumnProperties default_column_properties_; @@ -353,6 +683,18 @@ class PARQUET_EXPORT WriterProperties { inline std::string created_by() const { return parquet_created_by_; } + inline FileEncryptionProperties* file_encryption() const { + return 
parquet_file_encryption_.get(); + } + + inline std::shared_ptr footer_encryption() const { + if (parquet_file_encryption_ == NULLPTR) { + return NULLPTR; + } else { + return parquet_file_encryption_->GetFooterEncryptionProperties(); + } + } + inline Encoding::type dictionary_index_encoding() const { if (parquet_version_ == ParquetVersion::PARQUET_1_0) { return Encoding::PLAIN_DICTIONARY; @@ -396,11 +738,31 @@ class PARQUET_EXPORT WriterProperties { return column_properties(path).max_statistics_size(); } + std::shared_ptr column_encryption_props( + const std::shared_ptr& path) const { + if (parquet_file_encryption_) { + return parquet_file_encryption_->GetColumnCryptoMetaData(path); + } else { + return NULLPTR; + } + } + + std::shared_ptr encryption( + const std::shared_ptr& path) const { + if (parquet_file_encryption_) { + return parquet_file_encryption_->GetColumnEncryptionProperties(path); + } else { + return NULLPTR; + } + } + private: explicit WriterProperties( MemoryPool* pool, int64_t dictionary_pagesize_limit, int64_t write_batch_size, int64_t max_row_group_length, int64_t pagesize, ParquetVersion::type version, - const std::string& created_by, const ColumnProperties& default_column_properties, + const std::string& created_by, + std::shared_ptr file_encryption, + const ColumnProperties& default_column_properties, const std::unordered_map& column_properties) : pool_(pool), dictionary_pagesize_limit_(dictionary_pagesize_limit), @@ -409,6 +771,7 @@ class PARQUET_EXPORT WriterProperties { pagesize_(pagesize), parquet_version_(version), parquet_created_by_(created_by), + parquet_file_encryption_(file_encryption), default_column_properties_(default_column_properties), column_properties_(column_properties) {} @@ -419,6 +782,7 @@ class PARQUET_EXPORT WriterProperties { int64_t pagesize_; ParquetVersion::type parquet_version_; std::string parquet_created_by_; + std::shared_ptr parquet_file_encryption_; ColumnProperties default_column_properties_; std::unordered_map 
column_properties_; }; diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index c7b62073df52..092a881e3eb2 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -44,6 +44,7 @@ #include "parquet/exception.h" #include "parquet/platform.h" #include "parquet/statistics.h" +#include "parquet/util/crypto.h" #include "parquet/parquet_types.h" // IYWU pragma: export @@ -81,6 +82,16 @@ static inline Compression::type FromThrift(format::CompressionCodec::type type) return static_cast(type); } +static inline EncryptionAlgorithm FromThrift(format::EncryptionAlgorithm encryption) { + if (encryption.__isset.AES_GCM_V1) { + return EncryptionAlgorithm{Encryption::AES_GCM_V1, + encryption.AES_GCM_V1.aad_metadata}; + } else { + return EncryptionAlgorithm{Encryption::AES_GCM_CTR_V1, + encryption.AES_GCM_CTR_V1.aad_metadata}; + } +} + static inline format::Type::type ToThrift(Type::type type) { return static_cast(type); } @@ -131,6 +142,20 @@ static inline format::Statistics ToThrift(const EncodedStatistics& stats) { return statistics; } +static inline format::EncryptionAlgorithm ToThrift(EncryptionAlgorithm encryption) { + format::EncryptionAlgorithm encryption_algorithm; + if (encryption.algorithm == Encryption::AES_GCM_V1) { + encryption_algorithm.__isset.AES_GCM_V1 = true; + encryption_algorithm.AES_GCM_V1 = format::AesGcmV1(); + encryption_algorithm.AES_GCM_V1.aad_metadata = encryption.aad_metadata; + } else { + encryption_algorithm.__isset.AES_GCM_CTR_V1 = true; + encryption_algorithm.AES_GCM_CTR_V1 = format::AesGcmCtrV1(); + encryption_algorithm.AES_GCM_CTR_V1.aad_metadata = encryption.aad_metadata; + } + return encryption_algorithm; +} + // ---------------------------------------------------------------------- // Thrift struct serialization / deserialization utilities @@ -140,22 +165,48 @@ using ThriftBuffer = apache::thrift::transport::TMemoryBuffer; // all the bytes needed to store the thrift message. 
On return, len will be // set to the actual length of the header. template -inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg) { - // Deserialize msg bytes into c++ thrift msg using memory transport. - shared_ptr tmem_transport( - new ThriftBuffer(const_cast(buf), *len)); - apache::thrift::protocol::TCompactProtocolFactoryT tproto_factory; - shared_ptr tproto = // - tproto_factory.getProtocol(tmem_transport); - try { - deserialized_msg->read(tproto.get()); - } catch (std::exception& e) { - std::stringstream ss; - ss << "Couldn't deserialize thrift: " << e.what() << "\n"; - throw ParquetException(ss.str()); +inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg, + const EncryptionProperties* encryption = NULLPTR) { + if (encryption == NULLPTR) { + // Deserialize msg bytes into c++ thrift msg using memory transport. + shared_ptr tmem_transport( + new ThriftBuffer(const_cast(buf), *len)); + apache::thrift::protocol::TCompactProtocolFactoryT tproto_factory; + shared_ptr tproto = // + tproto_factory.getProtocol(tmem_transport); + try { + deserialized_msg->read(tproto.get()); + } catch (std::exception& e) { + std::stringstream ss; + ss << "Couldn't deserialize thrift: " << e.what() << "\n"; + throw ParquetException(ss.str()); + } + uint32_t bytes_left = tmem_transport->available_read(); + *len = *len - bytes_left; + } else { + // first 4 bytes for length + uint8_t clenBytes[4]; + memcpy(clenBytes, buf, 4); + + uint32_t clen = *(reinterpret_cast(clenBytes)); + + // decrypt + std::vector decrypted_buffer(encryption->CalculatePlainSize(clen)); + + uint32_t decrypted_buffer_len = parquet_encryption::Decrypt( + encryption->algorithm(), true, &buf[4], clen, encryption->key_bytes(), + encryption->key_length(), encryption->aad_bytes(), encryption->aad_length(), + decrypted_buffer.data()); + + if (decrypted_buffer_len <= 0) { + throw ParquetException("Couldn't decrypt buffer\n"); + } + + 
DeserializeThriftMsg(decrypted_buffer.data(), &decrypted_buffer_len, + deserialized_msg); + + *len = 4 + clen; } - uint32_t bytes_left = tmem_transport->available_read(); - *len = *len - bytes_left; } /// Utility class to serialize thrift objects to a binary format. This object @@ -186,12 +237,26 @@ class ThriftSerializer { } template - int64_t Serialize(const T* obj, ArrowOutputStream* out) { + int64_t Serialize(const T* obj, ArrowOutputStream* out, const EncryptionProperties* encryption = NULLPTR) { uint8_t* out_buffer; uint32_t out_length; SerializeToBuffer(obj, &out_length, &out_buffer); - PARQUET_THROW_NOT_OK(out->Write(out_buffer, out_length)); - return static_cast(out_length); + + if (encryption == NULLPTR) { + PARQUET_THROW_NOT_OK(out->Write(out_buffer, out_length)); + return static_cast(out_length); + } else { + std::vector cipher_buffer(encryption->CalculateCipherSize(out_length)); + int cipher_buffer_len = parquet_encryption::Encrypt( + encryption->algorithm(), true, out_buffer, out_length, encryption->key_bytes(), + encryption->key_length(), encryption->aad_bytes(), encryption->aad_length(), + cipher_buffer.data()); + + PARQUET_THROW_NOT_OK(out->Write(reinterpret_cast(&cipher_buffer_len), 4)); + PARQUET_THROW_NOT_OK(out->Write(cipher_buffer.data(), cipher_buffer_len)); + + return static_cast(cipher_buffer_len + 4); + } } private: diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 30395f37ec40..4e5b344e1565 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -468,6 +468,64 @@ struct Encryption { enum type { AES_GCM_V1 = 0, AES_GCM_CTR_V1 = 1 }; }; +struct EncryptionAlgorithm { + Encryption::type algorithm; + std::string aad_metadata; +}; + +class PARQUET_EXPORT EncryptionProperties { + private: + static inline uint8_t* str2bytes(const std::string& str) { + if (str.empty()) return NULLPTR; + + char* cbytes = const_cast(str.c_str()); + return reinterpret_cast(cbytes); + } + + public: + EncryptionProperties() = default; + 
EncryptionProperties(Encryption::type algorithm, const std::string& key, + const std::string& aad = "") + : algorithm_(algorithm), key_(key), aad_(aad) {} + + ~EncryptionProperties() { key_.replace(0, key_.length(), key_.length(), '\0'); } + + int key_length() const { return static_cast(key_.length()); } + uint8_t* key_bytes() const { return str2bytes(key_); } + + void aad(const std::string& aad) { aad_ = aad; } + int aad_length() const { return static_cast(aad_.length()); } + uint8_t* aad_bytes() const { return str2bytes(aad_); } + + Encryption::type algorithm() const { return algorithm_; } + + const std::string& key() const { return key_; } + const std::string& aad() const { return aad_; } + + uint32_t CalculateCipherSize(uint32_t plain_len) const { + if (algorithm_ == Encryption::AES_GCM_V1) { + return plain_len + 28; + } else if (algorithm_ == Encryption::AES_GCM_CTR_V1) { + return plain_len + 16; + } + return plain_len; + } + + uint32_t CalculatePlainSize(uint32_t cipher_len) const { + if (algorithm_ == Encryption::AES_GCM_V1) { + return cipher_len - 28; + } else if (algorithm_ == Encryption::AES_GCM_CTR_V1) { + return cipher_len - 16; + } + return cipher_len; + } + + private: + Encryption::type algorithm_; // encryption algorithm + std::string key_; // encryption key, should have 16, 24, 32-byte length + std::string aad_; // encryption additional authenticated data +}; + // parquet::PageType struct PageType { enum type { DATA_PAGE, INDEX_PAGE, DICTIONARY_PAGE, DATA_PAGE_V2 }; From ab65b1db99ed89cbeded3fd38aa94cb1cc24d706 Mon Sep 17 00:00:00 2001 From: Tham Ha Thi Date: Thu, 24 Jan 2019 10:30:32 +0700 Subject: [PATCH 002/201] update thrift change and update encrypted footer --- cpp/src/parquet/file_reader.cc | 76 ++++++++++++++------------- cpp/src/parquet/file_writer.cc | 27 +++------- cpp/src/parquet/metadata.cc | 76 ++++++++++++--------------- cpp/src/parquet/metadata.h | 12 ++--- cpp/src/parquet/thrift.h | 94 ++++++++++++++++++++++++---------- 
cpp/src/parquet/types.h | 16 ++++-- 6 files changed, 163 insertions(+), 138 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 970fbdf31d61..497e9758596a 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -129,7 +129,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { // file is unencrypted // or file is encrypted but column is unencrypted - if (!file_crypto_metadata_ || !crypto_meta_data) { + if (!file_crypto_metadata_ || !crypto_metadata) { encrypted = false; } @@ -143,8 +143,8 @@ class SerializedRowGroup : public RowGroupReader::Contents { auto file_decryption = properties_.file_decryption(); // the column is encrypted with footer key - if (crypto_meta_data->encrypted_with_footer_key()) { - std::string footer_key_metadata = file_crypto_metadata_->footer_key_metadata(); + if (crypto_metadata->encrypted_with_footer_key()) { + std::string footer_key_metadata = file_crypto_metadata_->key_metadata(); std::string footer_key = file_decryption->GetFooterKey(footer_key_metadata); if (footer_key.empty()) { @@ -161,7 +161,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { // file is non-uniform encrypted and the column is encrypted with its own key - std::string column_key_metadata = crypto_meta_data->column_key_metadata(); + std::string column_key_metadata = crypto_metadata->key_metadata(); // encrypted with column key std::string column_key = file_decryption->GetColumnKey(col->path_in_schema(), column_key_metadata); @@ -238,7 +238,8 @@ class SerializedFile : public ParquetFileReader::Contents { throw ParquetException("Invalid parquet file. 
Corrupt footer."); } - // no encryption + // no encryption or encryption with plaintext footer + // TODO: encryption with plaintext footer if (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) == 0) { uint32_t metadata_len = arrow::util::SafeLoadAs( reinterpret_cast(footer_buffer->data()) + footer_read_size - @@ -264,58 +265,61 @@ class SerializedFile : public ParquetFileReader::Contents { } file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &metadata_len); } - // encryption + // encryption with encrypted footer else { - // read crypto metadata - uint32_t crypto_metadata_len = arrow::util::SafeLoadAs( + // both metadata & crypto metadata length + uint32_t footer_len = arrow::util::SafeLoadAs( reinterpret_cast(footer_buffer->data()) + footer_read_size - kFooterSize); - int64_t crypto_metadata_start = file_size - kFooterSize - crypto_metadata_len; + int64_t crypto_metadata_start = file_size - kFooterSize - footer_len; - if (kFooterSize + crypto_metadata_len > file_size) { + if (kFooterSize + footer_len > file_size) { throw ParquetException( "Invalid parquet file. File is less than " "file metadata size."); } - std::shared_ptr crypto_metadata_buffer; + // Check if the footer_buffer contains the entire metadata - if (footer_read_size >= (crypto_metadata_len + kFooterSize)) { + if (footer_read_size >= (footer_len + kFooterSize)) { crypto_metadata_buffer = SliceBuffer( - footer_buffer, footer_read_size - crypto_metadata_len - kFooterSize, crypto_metadata_len); + footer_buffer, footer_read_size - footer_len - kFooterSize, footer_len); } else { PARQUET_THROW_NOT_OK( - source_->ReadAt(crypto_metadata_start, crypto_metadata_len, &crypto_metadata_buffer)); - if (crypto_metadata_buffer->size() != crypto_metadata_len) { + source_->ReadAt(crypto_metadata_start, footer_len, &crypto_metadata_buffer)); + if (crypto_metadata_buffer->size() != footer_len) { throw ParquetException("Invalid parquet file. 
Could not read metadata bytes."); } } + uint32_t crypto_metadata_len = footer_len; file_crypto_metadata_ = - FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); - - int64_t footer_offset = file_crypto_metadata_->footer_offset(); - uint32_t footer_read_size = (uint32_t)(crypto_metadata_start - footer_offset); - - std::shared_ptr metadata_buffer = - SliceBuffer(footer_buffer, footer_offset, footer_read_size); + FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); - if (file_crypto_metadata_->encrypted_footer()) { - // get footer key metadata - std::string footer_key_metadata = file_crypto_metadata_->footer_key_metadata(); - - auto file_decryption = properties_.file_decryption(); - std::string footer_key = file_decryption->GetFooterKey(footer_key_metadata); - - auto footer_encryption = std::make_shared( - file_crypto_metadata_->encryption_algorithm().algorithm, footer_key, - file_decryption->GetAad()); + int64_t metadata_offset = file_size - kFooterSize - footer_len + crypto_metadata_len; + uint32_t metadata_len = footer_len - crypto_metadata_len; + std::shared_ptr metadata_buffer; + PARQUET_THROW_NOT_OK( + source_->ReadAt(metadata_offset, metadata_len, &metadata_buffer)); + if (metadata_buffer->size() != metadata_len) { + throw ParquetException("Invalid encrypted parquet file. Could not read footer metadata bytes."); + } - file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &footer_read_size, - footer_encryption); - } else { - file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &footer_read_size); + // get footer key metadata + std::string footer_key_metadata = file_crypto_metadata_->key_metadata(); + auto file_decryption = properties_.file_decryption(); + if (file_decryption == nullptr) { + throw ParquetException("No decryption properties are provided. 
Could not read encrypted footer metadata"); + } + std::string footer_key = file_decryption->GetFooterKey(footer_key_metadata); + if (footer_key.size() == 0) { + throw ParquetException("Invalid footer encryption key. Could not parse footer metadata"); } + auto footer_encryption = std::make_shared( + file_crypto_metadata_->encryption_algorithm().algorithm, footer_key, + file_decryption->GetAad()); + file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &metadata_len, + footer_encryption); } } diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 64716400070a..7d8633cb4d6a 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -274,13 +274,16 @@ class FileSerializer : public ParquetFileWriter::Contents { WriteFileMetaData(*file_metadata_, sink_.get()); } else { uint64_t metadata_start = static_cast(sink_->Tell()); + auto crypto_metadata = metadata_->GetCryptoMetaData(); + WriteFileCryptoMetaData(*crypto_metadata, sink_.get()); std::shared_ptr footer_encryption = - file_encryption->GetFooterEncryptionProperties(); + file_encryption->GetFooterEncryptionProperties(); WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryption.get()); + uint32_t footer_and_crypto_len = static_cast(sink_->Tell() - metadata_start); + sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4); - auto crypto_metadata = metadata_->GetCryptoMetaData(metadata_start); - WriteFileCryptoMetaData(*crypto_metadata, sink_.get()); + sink_->Write(PARQUET_EMAGIC, 4); } sink_->Close(); @@ -395,7 +398,6 @@ void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sin uint32_t metadata_len = static_cast(position); file_metadata.WriteTo(sink); - PARQUET_THROW_NOT_OK(sink->Tell(&position)); metadata_len = static_cast(position) - metadata_len; @@ -413,25 +415,10 @@ void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink) { return WriteFileMetaData(file_metadata, &wrapper); } -void 
WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, - ArrowOutputStream* sink) { - int64_t position = -1; - PARQUET_THROW_NOT_OK(sink->Tell(&position)); - uint64_t crypto_offset = static_cast(position); - - // Get a FileCryptoMetaData - crypto_metadata.WriteTo(sink); - PARQUET_THROW_NOT_OK(sink->Tell(&position)); - auto crypto_len = static_cast(position) - crypto_offset; - PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast(&crypto_len), 4)); - - PARQUET_THROW_NOT_OK(sink->Write(PARQUET_EMAGIC, 4)); -} - void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, OutputStream* sink) { ParquetOutputWrapper wrapper(sink); - return WriteFileCryptoMetaData(crypto_metadata, &wrapper); + crypto_metadata.WriteTo(sink); } void WriteMetaDataFile(const FileMetaData& file_metadata, ArrowOutputStream* sink) { diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index d48fd36bf6b1..90b288ffeb04 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -86,7 +86,7 @@ static std::shared_ptr MakeTypedColumnStats( descr, metadata.statistics.min_value, metadata.statistics.max_value, metadata.num_values - metadata.statistics.null_count, metadata.statistics.null_count, metadata.statistics.distinct_count, - metadata.statistics.__isset.max_value || metadata.statistics.__isset.min_value); + metadata.statistics.__isset.max_value || metadata.statistics.__isset.min_value); } // Default behavior return MakeStatistics( @@ -139,8 +139,8 @@ class ColumnCryptoMetaData::ColumnCryptoMetaDataImpl { const std::vector& path_in_schema() const { return crypto_metadata_->ENCRYPTION_WITH_COLUMN_KEY.path_in_schema; } - const std::string& column_key_metadata() const { - return crypto_metadata_->ENCRYPTION_WITH_COLUMN_KEY.column_key_metadata; + const std::string& key_metadata() const { + return crypto_metadata_->ENCRYPTION_WITH_COLUMN_KEY.key_metadata; } private: @@ -164,8 +164,8 @@ const std::vector& ColumnCryptoMetaData::path_in_schema() const 
{ bool ColumnCryptoMetaData::encrypted_with_footer_key() const { return impl_->encrypted_with_footer_key(); } -const std::string& ColumnCryptoMetaData::column_key_metadata() const { - return impl_->column_key_metadata(); +const std::string& ColumnCryptoMetaData::key_metadata() const { + return impl_->key_metadata(); } // ColumnChunk metadata @@ -250,10 +250,10 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { return column_->meta_data.total_uncompressed_size; } - inline std::unique_ptr crypto_meta_data() const { - if (column_->__isset.crypto_meta_data) { + inline std::unique_ptr crypto_metadata() const { + if (column_->__isset.crypto_metadata) { return ColumnCryptoMetaData::Make( - reinterpret_cast(&column_->crypto_meta_data)); + reinterpret_cast(&column_->crypto_metadata)); } else { return nullptr; } @@ -336,8 +336,8 @@ int64_t ColumnChunkMetaData::total_compressed_size() const { return impl_->total_compressed_size(); } -std::unique_ptr ColumnChunkMetaData::crypto_meta_data() const { - return impl_->crypto_meta_data(); +std::unique_ptr ColumnChunkMetaData::crypto_metadata() const { + return impl_->crypto_metadata(); } // row-group metadata @@ -409,7 +409,7 @@ class FileMetaData::FileMetaDataImpl { : metadata_len_(0) { metadata_.reset(new format::FileMetaData); DeserializeThriftMsg(reinterpret_cast(metadata), metadata_len, - metadata_.get(), encryption.get()); + metadata_.get(), encryption.get(), false); metadata_len_ = *metadata_len; if (metadata_->__isset.created_by) { @@ -439,7 +439,7 @@ class FileMetaData::FileMetaDataImpl { void WriteTo(::arrow::io::OutputStream* dst, EncryptionProperties* encryption) const { ThriftSerializer serializer; - serializer.Serialize(metadata_.get(), dst, encryption); + serializer.Serialize(metadata_.get(), dst, encryption, false); } std::unique_ptr RowGroup(int i) { @@ -605,13 +605,7 @@ class FileCryptoMetaData::FileCryptoMetaDataImpl { EncryptionAlgorithm encryption_algorithm() { return 
FromThrift(metadata_->encryption_algorithm); } - - bool encrypted_footer() { return metadata_->encrypted_footer; } - - const std::string& footer_key_metadata() { return metadata_->footer_key_metadata; } - - uint64_t footer_offset() { return metadata_->footer_offset; } - + const std::string& key_metadata() { return metadata_->key_metadata; } void WriteTo(::arrow::io::OutputStream* dst) const { ThriftSerializer serializer; serializer.Serialize(metadata_.get(), dst); @@ -623,14 +617,13 @@ class FileCryptoMetaData::FileCryptoMetaDataImpl { uint32_t metadata_len_; }; -EncryptionAlgorithm FileCryptoMetaData::encryption_algorithm() { +EncryptionAlgorithm FileCryptoMetaData::encryption_algorithm() const { return impl_->encryption_algorithm(); } -bool FileCryptoMetaData::encrypted_footer() { return impl_->encrypted_footer(); } -const std::string& FileCryptoMetaData::footer_key_metadata() { - return impl_->footer_key_metadata(); + +const std::string& FileCryptoMetaData::key_metadata() const { + return impl_->key_metadata(); } -uint64_t FileCryptoMetaData::footer_offset() { return impl_->footer_offset(); } std::shared_ptr FileCryptoMetaData::Make( const uint8_t* serialized_metadata, uint32_t* metadata_len) { @@ -831,7 +824,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { serializer.Serialize(column_chunk_, sink); } else { // column is encrypted - column_chunk_->__isset.crypto_meta_data = true; + column_chunk_->__isset.crypto_metadata = true; // encrypted with footer key format::ColumnCryptoMetaData ccmd; @@ -840,12 +833,12 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { ccmd.__set_ENCRYPTION_WITH_FOOTER_KEY(format::EncryptionWithFooterKey()); } else { // encrypted with column key format::EncryptionWithColumnKey eck; - eck.__set_column_key_metadata(encrypt_md->key_metadata()); + eck.__set_key_metadata(encrypt_md->key_metadata()); eck.__set_path_in_schema(column_->path()->ToDotVector()); ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY = 
true; ccmd.__set_ENCRYPTION_WITH_COLUMN_KEY(eck); } - column_chunk_->__set_crypto_meta_data(ccmd); + column_chunk_->__set_crypto_metadata(ccmd); auto footer_encryption = properties_->footer_encryption(); @@ -1125,7 +1118,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { return file_meta_data; } - std::unique_ptr BuildFileCryptoMetaData(uint64_t footerOffset) { + std::unique_ptr BuildFileCryptoMetaData() { if (crypto_metadata_ == nullptr) { return nullptr; } @@ -1136,23 +1129,19 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { // build format::FileCryptoMetaData EncryptionAlgorithm encryption_algorithm; encryption_algorithm.algorithm = footer_encryption->algorithm(); - encryption_algorithm.aad_metadata = file_encryption->aad_metadata(); + // TODO: aad metadata + //encryption_algorithm.aad_metadata = file_encryption->aad_metadata(); crypto_metadata_->__set_encryption_algorithm(ToThrift(encryption_algorithm)); - crypto_metadata_->__set_encrypted_footer(!footer_encryption->key().empty()); - - std::string footer_key_metadata = file_encryption->footer_key_metadata(); - if (!footer_key_metadata.empty()) { - crypto_metadata_->__set_footer_key_metadata(footer_key_metadata); + std::string key_metadata = file_encryption->footer_key_metadata(); + if (!key_metadata.empty()) { + crypto_metadata_->__set_key_metadata(key_metadata); } - crypto_metadata_->__set_footer_offset(footerOffset); - // TODO set iv_prefix??? 
- - // return as FileCryptoMetaData - std::unique_ptr file_crypto_meta_data = + std::unique_ptr file_crypto_metadata = std::unique_ptr(new FileCryptoMetaData()); - file_crypto_meta_data->impl_->metadata_ = std::move(crypto_metadata_); - return file_crypto_meta_data; + file_crypto_metadata->impl_->metadata_ = std::move(crypto_metadata_); + + return file_crypto_metadata; } protected: @@ -1189,9 +1178,8 @@ RowGroupMetaDataBuilder* FileMetaDataBuilder::AppendRowGroup() { std::unique_ptr FileMetaDataBuilder::Finish() { return impl_->Finish(); } -std::unique_ptr FileMetaDataBuilder::GetCryptoMetaData( - uint64_t footerOffset) { - return impl_->BuildFileCryptoMetaData(footerOffset); +std::unique_ptr FileMetaDataBuilder::GetCryptoMetaData() { + return impl_->BuildFileCryptoMetaData(); } } // namespace parquet diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 2ec7e5b17f75..a961f624cb66 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -104,7 +104,7 @@ class PARQUET_EXPORT ColumnCryptoMetaData { const std::vector& path_in_schema() const; bool encrypted_with_footer_key() const; - const std::string& column_key_metadata() const; + const std::string& key_metadata() const; private: explicit ColumnCryptoMetaData(const uint8_t* metadata); @@ -143,7 +143,7 @@ class PARQUET_EXPORT ColumnChunkMetaData { int64_t index_page_offset() const; int64_t total_compressed_size() const; int64_t total_uncompressed_size() const; - std::unique_ptr crypto_meta_data() const; + std::unique_ptr crypto_metadata() const; private: explicit ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, @@ -235,10 +235,8 @@ class PARQUET_EXPORT FileCryptoMetaData { uint32_t* metadata_len); ~FileCryptoMetaData(); - EncryptionAlgorithm encryption_algorithm(); - bool encrypted_footer(); - const std::string& footer_key_metadata(); - uint64_t footer_offset(); + EncryptionAlgorithm encryption_algorithm() const; + const std::string& key_metadata() const; 
void WriteTo(::arrow::io::OutputStream* dst) const; @@ -337,7 +335,7 @@ class PARQUET_EXPORT FileMetaDataBuilder { std::unique_ptr Finish(); // crypto metadata - std::unique_ptr GetCryptoMetaData(uint64_t footerOffset); + std::unique_ptr GetCryptoMetaData(); private: explicit FileMetaDataBuilder( diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 092a881e3eb2..8f2e58ebb29e 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -82,14 +82,34 @@ static inline Compression::type FromThrift(format::CompressionCodec::type type) return static_cast(type); } +static inline AadMetadata FromThrift(format::AesGcmV1 aesGcmV1) { + return AadMetadata { + aesGcmV1.aad_prefix, + aesGcmV1.aad_file_unique, + aesGcmV1.supply_aad_prefix + }; +} + +static inline AadMetadata FromThrift(format::AesGcmCtrV1 aesGcmCtrV1) { + return AadMetadata { + aesGcmCtrV1.aad_prefix, + aesGcmCtrV1.aad_file_unique, + aesGcmCtrV1.supply_aad_prefix + }; +} + static inline EncryptionAlgorithm FromThrift(format::EncryptionAlgorithm encryption) { + EncryptionAlgorithm encryption_algorithm; + if (encryption.__isset.AES_GCM_V1) { - return EncryptionAlgorithm{Encryption::AES_GCM_V1, - encryption.AES_GCM_V1.aad_metadata}; + encryption_algorithm.algorithm = Encryption::AES_GCM_V1; + encryption_algorithm.aad = FromThrift(encryption.AES_GCM_V1); + } else { - return EncryptionAlgorithm{Encryption::AES_GCM_CTR_V1, - encryption.AES_GCM_CTR_V1.aad_metadata}; + encryption_algorithm.algorithm = Encryption::AES_GCM_CTR_V1; + encryption_algorithm.aad = FromThrift(encryption.AES_GCM_CTR_V1); } + return encryption_algorithm; } static inline format::Type::type ToThrift(Type::type type) { @@ -142,16 +162,30 @@ static inline format::Statistics ToThrift(const EncodedStatistics& stats) { return statistics; } +static inline format::AesGcmV1 ToAesGcmV1Thrift(AadMetadata aad) { + format::AesGcmV1 aesGcmV1; + aesGcmV1.aad_prefix = aad.aad_prefix; + aesGcmV1.aad_file_unique = aad.aad_file_unique; + 
aesGcmV1.supply_aad_prefix = aad.supply_aad_prefix; + return aesGcmV1; +} + +static inline format::AesGcmCtrV1 ToAesGcmCtrV1Thrift(AadMetadata aad) { + format::AesGcmCtrV1 aesGcmCtrV1; + aesGcmCtrV1.aad_prefix = aad.aad_prefix; + aesGcmCtrV1.aad_file_unique = aad.aad_file_unique; + aesGcmCtrV1.supply_aad_prefix = aad.supply_aad_prefix; + return aesGcmCtrV1; +} + static inline format::EncryptionAlgorithm ToThrift(EncryptionAlgorithm encryption) { format::EncryptionAlgorithm encryption_algorithm; if (encryption.algorithm == Encryption::AES_GCM_V1) { encryption_algorithm.__isset.AES_GCM_V1 = true; - encryption_algorithm.AES_GCM_V1 = format::AesGcmV1(); - encryption_algorithm.AES_GCM_V1.aad_metadata = encryption.aad_metadata; + encryption_algorithm.AES_GCM_V1 = ToAesGcmV1Thrift(encryption.aad); } else { encryption_algorithm.__isset.AES_GCM_CTR_V1 = true; - encryption_algorithm.AES_GCM_CTR_V1 = format::AesGcmCtrV1(); - encryption_algorithm.AES_GCM_CTR_V1.aad_metadata = encryption.aad_metadata; + encryption_algorithm.AES_GCM_CTR_V1 = ToAesGcmCtrV1Thrift(encryption.aad); } return encryption_algorithm; } @@ -166,7 +200,8 @@ using ThriftBuffer = apache::thrift::transport::TMemoryBuffer; // set to the actual length of the header. template inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg, - const EncryptionProperties* encryption = NULLPTR) { + const EncryptionProperties* encryption = NULLPTR, + bool shouldReadLength = true) { if (encryption == NULLPTR) { // Deserialize msg bytes into c++ thrift msg using memory transport. 
shared_ptr tmem_transport( @@ -184,27 +219,28 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali uint32_t bytes_left = tmem_transport->available_read(); *len = *len - bytes_left; } else { - // first 4 bytes for length - uint8_t clenBytes[4]; - memcpy(clenBytes, buf, 4); - - uint32_t clen = *(reinterpret_cast(clenBytes)); - + uint32_t clen; + if (shouldReadLength) { + // first 4 bytes for length + uint8_t clenBytes[4]; + memcpy(clenBytes, buf, 4); + clen = *(reinterpret_cast(clenBytes)); + } + else { + clen = *len; + } // decrypt - std::vector decrypted_buffer(encryption->CalculatePlainSize(clen)); - + const uint8_t* cipherBuf = shouldReadLength ? &buf[4] : buf; + std::vector decrypted_buffer(encryption->CalculatePlainSize(clen, true)); uint32_t decrypted_buffer_len = parquet_encryption::Decrypt( - encryption->algorithm(), true, &buf[4], clen, encryption->key_bytes(), + encryption->algorithm(), true, cipherBuf, clen, encryption->key_bytes(), encryption->key_length(), encryption->aad_bytes(), encryption->aad_length(), decrypted_buffer.data()); - if (decrypted_buffer_len <= 0) { throw ParquetException("Couldn't decrypt buffer\n"); } - DeserializeThriftMsg(decrypted_buffer.data(), &decrypted_buffer_len, deserialized_msg); - *len = 4 + clen; } } @@ -237,7 +273,8 @@ class ThriftSerializer { } template - int64_t Serialize(const T* obj, ArrowOutputStream* out, const EncryptionProperties* encryption = NULLPTR) { + int64_t Serialize(const T* obj, ArrowOutputStream* out, const EncryptionProperties* encryption = NULLPTR, + bool shouldWriteLength = true) { uint8_t* out_buffer; uint32_t out_length; SerializeToBuffer(obj, &out_length, &out_buffer); @@ -252,10 +289,15 @@ class ThriftSerializer { encryption->key_length(), encryption->aad_bytes(), encryption->aad_length(), cipher_buffer.data()); - PARQUET_THROW_NOT_OK(out->Write(reinterpret_cast(&cipher_buffer_len), 4)); - PARQUET_THROW_NOT_OK(out->Write(cipher_buffer.data(), cipher_buffer_len)); - - 
return static_cast(cipher_buffer_len + 4); + if (shouldWriteLength) { + PARQUET_THROW_NOT_OK(out->Write(reinterpret_cast(&cipher_buffer_len), 4)); + PARQUET_THROW_NOT_OK(out->Write(cipher_buffer.data(), cipher_buffer_len)); + return static_cast(cipher_buffer_len + 4); + } + else { + PARQUET_THROW_NOT_OK(out->Write(cipher_buffer.data(), cipher_buffer_len)); + return static_cast(cipher_buffer_len); + } } } diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 4e5b344e1565..d8da74505321 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -468,9 +468,15 @@ struct Encryption { enum type { AES_GCM_V1 = 0, AES_GCM_CTR_V1 = 1 }; }; +struct AadMetadata { + std::string aad_prefix; + std::string aad_file_unique; + bool supply_aad_prefix; +}; + struct EncryptionAlgorithm { Encryption::type algorithm; - std::string aad_metadata; + AadMetadata aad; }; class PARQUET_EXPORT EncryptionProperties { @@ -502,8 +508,8 @@ class PARQUET_EXPORT EncryptionProperties { const std::string& key() const { return key_; } const std::string& aad() const { return aad_; } - uint32_t CalculateCipherSize(uint32_t plain_len) const { - if (algorithm_ == Encryption::AES_GCM_V1) { + uint32_t CalculateCipherSize(uint32_t plain_len, bool is_metadata = false) const { + if (is_metadata || algorithm_ == Encryption::AES_GCM_V1) { return plain_len + 28; } else if (algorithm_ == Encryption::AES_GCM_CTR_V1) { return plain_len + 16; @@ -511,8 +517,8 @@ class PARQUET_EXPORT EncryptionProperties { return plain_len; } - uint32_t CalculatePlainSize(uint32_t cipher_len) const { - if (algorithm_ == Encryption::AES_GCM_V1) { + uint32_t CalculatePlainSize(uint32_t cipher_len, bool is_metadata = false) const { + if (is_metadata || algorithm_ == Encryption::AES_GCM_V1) { return cipher_len - 28; } else if (algorithm_ == Encryption::AES_GCM_CTR_V1) { return cipher_len - 16; From 0ad09da91f77a205f3c00fa7da01c0cd2a70e4a7 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Mon, 28 Jan 2019 17:12:39 
+0700 Subject: [PATCH 003/201] add encryption source files into CMakeLists.txt --- cpp/src/parquet/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 7cc05e3066e3..e5ecee4a8d27 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -167,6 +167,7 @@ set(PARQUET_SRCS column_writer.cc deprecated_io.cc encoding.cc + encryption.cc file_reader.cc file_writer.cc metadata.cc @@ -178,7 +179,8 @@ set(PARQUET_SRCS properties.cc schema.cc statistics.cc - types.cc) + types.cc + util/crypto.cc) # Ensure that thrift compilation is done before using its generated headers # in parquet code. From 5c295c2fa2b2e4f1e87c59c600c71fcfdfbf27f3 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Tue, 29 Jan 2019 18:30:31 +0700 Subject: [PATCH 004/201] add example from old PR of parquet-cpp --- cpp/examples/parquet/CMakeLists.txt | 4 + .../low-level-api/encryption-reader-writer.cc | 436 ++++++++++++++++++ 2 files changed, 440 insertions(+) create mode 100644 cpp/examples/parquet/low-level-api/encryption-reader-writer.cc diff --git a/cpp/examples/parquet/CMakeLists.txt b/cpp/examples/parquet/CMakeLists.txt index f2722b1cbf85..2a2421c18ab9 100644 --- a/cpp/examples/parquet/CMakeLists.txt +++ b/cpp/examples/parquet/CMakeLists.txt @@ -17,10 +17,13 @@ add_executable(parquet-low-level-example low-level-api/reader-writer.cc) add_executable(parquet-low-level-example2 low-level-api/reader-writer2.cc) +add_executable(parquet-encryption-example low-level-api/encryption-reader-writer.cc) target_include_directories(parquet-low-level-example PRIVATE low-level-api/) target_include_directories(parquet-low-level-example2 PRIVATE low-level-api/) +target_include_directories(parquet-encryption-example PRIVATE low-level-api/) target_link_libraries(parquet-low-level-example parquet_static) target_link_libraries(parquet-low-level-example2 parquet_static) 
+target_link_libraries(parquet-encryption-example parquet_static) add_executable(parquet-arrow-example parquet-arrow/reader-writer.cc) # Prefer shared linkage but use static if shared build is deactivated @@ -33,4 +36,5 @@ endif() add_dependencies(parquet parquet-low-level-example parquet-low-level-example2 + parquet-encryption-example parquet-arrow-example) diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc new file mode 100644 index 000000000000..0f8e9ab3e832 --- /dev/null +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc @@ -0,0 +1,436 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include + +#include + +/* + * This example describes writing and reading Parquet Files in C++ and serves as a + * reference to the API. + * The file contains all the physical data types supported by Parquet. + * This example uses the RowGroupWriter API that supports writing RowGroups optimized for + *memory consumption + **/ + +/* Parquet is a structured columnar file format + * Parquet File = "Parquet data" + "Parquet Metadata" + * "Parquet data" is simply a vector of RowGroups. 
Each RowGroup is a batch of rows in a + * columnar layout + * "Parquet Metadata" contains the "file schema" and attributes of the RowGroups and their + * Columns + * "file schema" is a tree where each node is either a primitive type (leaf nodes) or a + * complex (nested) type (internal nodes) + * For specific details, please refer to the format here: + * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md + **/ + +constexpr int NUM_ROWS_PER_ROW_GROUP = 500; +const char PARQUET_FILENAME[] = "parquet_cpp_example.parquet.encrypted"; +const std::string FOOTER_ENCRYPTION_KEY = "0123456789012345"; // 16 bytes +const std::string COLUMN_ENCRYPTION_KEY = "1234567890123450"; // 16 bytes + +int main(int argc, char** argv) { + /********************************************************************************** + PARQUET WRITER EXAMPLE + **********************************************************************************/ + // parquet::REQUIRED fields do not need definition and repetition level values + // parquet::OPTIONAL fields require only definition level values + // parquet::REPEATED fields require both definition and repetition level values + try { + // Create a local file output stream instance. 
+ using FileClass = ::arrow::io::FileOutputStream; + std::shared_ptr out_file; + PARQUET_THROW_NOT_OK(FileClass::Open(PARQUET_FILENAME, &out_file)); + + // Setup the parquet schema + std::shared_ptr schema = SetupSchema(); + + // Add writer properties + parquet::WriterProperties::Builder builder; + builder.compression(parquet::Compression::SNAPPY); + + // uniform encryption + parquet::FileEncryptionProperties::Builder file_encryption_builder; + file_encryption_builder.footer_key(FOOTER_ENCRYPTION_KEY); + + // non-uniform with column keys + std::map> encryption_cols; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder0("column_0", true); + encryption_col_builder0.key(COLUMN_ENCRYPTION_KEY); + auto encryption_col0 = encryption_col_builder0.build(); + + encryption_cols[encryption_col0->path()] = encryption_col0; + + file_encryption_builder.column_properties(encryption_cols, true); + + builder.encryption(file_encryption_builder.build()); + + std::shared_ptr props = builder.build(); + + // Create a ParquetFileWriter instance + std::shared_ptr file_writer = + parquet::ParquetFileWriter::Open(out_file, schema, props); + + // Append a RowGroup with a specific number of rows. + parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup(); + + // Write the Bool column + parquet::BoolWriter* bool_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + bool value = ((i % 2) == 0) ? true : false; + bool_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Int32 column + parquet::Int32Writer* int32_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + int32_t value = i; + int32_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Int64 column. Each row repeats twice. 
+ parquet::Int64Writer* int64_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < 2 * NUM_ROWS_PER_ROW_GROUP; i++) { + int64_t value = i * 1000 * 1000; + value *= 1000 * 1000; + int16_t definition_level = 1; + int16_t repetition_level = 0; + if ((i % 2) == 0) { + repetition_level = 1; // start of a new record + } + int64_writer->WriteBatch(1, &definition_level, &repetition_level, &value); + } + + // Write the INT96 column. + parquet::Int96Writer* int96_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + parquet::Int96 value; + value.value[0] = i; + value.value[1] = i + 1; + value.value[2] = i + 2; + int96_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Float column + parquet::FloatWriter* float_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + float value = static_cast(i) * 1.1f; + float_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Double column + parquet::DoubleWriter* double_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + double value = i * 1.1111111; + double_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the ByteArray column. 
Make every alternate value NULL + parquet::ByteArrayWriter* ba_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + parquet::ByteArray value; + char hello[FIXED_LENGTH] = "parquet"; + hello[7] = static_cast(static_cast('0') + i / 100); + hello[8] = static_cast(static_cast('0') + (i / 10) % 10); + hello[9] = static_cast(static_cast('0') + i % 10); + if (i % 2 == 0) { + int16_t definition_level = 1; + value.ptr = reinterpret_cast(&hello[0]); + value.len = FIXED_LENGTH; + ba_writer->WriteBatch(1, &definition_level, nullptr, &value); + } else { + int16_t definition_level = 0; + ba_writer->WriteBatch(1, &definition_level, nullptr, nullptr); + } + } + + // Write the FixedLengthByteArray column + parquet::FixedLenByteArrayWriter* flba_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + parquet::FixedLenByteArray value; + char v = static_cast(i); + char flba[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v}; + value.ptr = reinterpret_cast(&flba[0]); + + flba_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Close the ParquetFileWriter + file_writer->Close(); + + // Write the bytes to file + DCHECK(out_file->Close().ok()); + } catch (const std::exception& e) { + std::cerr << "Parquet write error: " << e.what() << std::endl; + return -1; + } + + /********************************************************************************** + PARQUET READER EXAMPLE + **********************************************************************************/ + + try { + // decryption properties + std::shared_ptr decryption_properties = + std::make_shared(FOOTER_ENCRYPTION_KEY); + decryption_properties->SetColumnKey("column_0", COLUMN_ENCRYPTION_KEY); + + parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); + reader_properties.file_decryption(decryption_properties); + + // Create a ParquetReader instance + std::unique_ptr parquet_reader = + 
parquet::ParquetFileReader::OpenFile(PARQUET_FILENAME, false, reader_properties); + + // Get the File MetaData + std::shared_ptr file_metadata = parquet_reader->metadata(); + + // Get the number of RowGroups + int num_row_groups = file_metadata->num_row_groups(); + assert(num_row_groups == 1); + + // Get the number of Columns + int num_columns = file_metadata->num_columns(); + assert(num_columns == 8); + + // Iterate over all the RowGroups in the file + for (int r = 0; r < num_row_groups; ++r) { + // Get the RowGroup Reader + std::shared_ptr row_group_reader = + parquet_reader->RowGroup(r); + + int64_t values_read = 0; + int64_t rows_read = 0; + int16_t definition_level; + int16_t repetition_level; + int i; + std::shared_ptr column_reader; + + // Get the Column Reader for the boolean column + column_reader = row_group_reader->Column(0); + parquet::BoolReader* bool_reader = + static_cast(column_reader.get()); + + // Read all the rows in the column + i = 0; + while (bool_reader->HasNext()) { + bool value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + bool expected_value = ((i % 2) == 0) ? true : false; + assert(value == expected_value); + i++; + } + + // Get the Column Reader for the Int32 column + column_reader = row_group_reader->Column(1); + parquet::Int32Reader* int32_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int32_reader->HasNext()) { + int32_t value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + assert(value == i); + i++; + } + + // Get the Column Reader for the Int64 column + column_reader = row_group_reader->Column(2); + parquet::Int64Reader* int64_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int64_reader->HasNext()) { + int64_t value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = int64_reader->ReadBatch(1, &definition_level, &repetition_level, + &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + int64_t expected_value = i * 1000 * 1000; + expected_value *= 1000 * 1000; + assert(value == expected_value); + if ((i % 2) == 0) { + assert(repetition_level == 1); + } else { + assert(repetition_level == 0); + } + i++; + } + + // Get the Column Reader for the Int96 column + column_reader = row_group_reader->Column(3); + parquet::Int96Reader* int96_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int96_reader->HasNext()) { + parquet::Int96 value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = int96_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + parquet::Int96 expected_value; + expected_value.value[0] = i; + expected_value.value[1] = i + 1; + expected_value.value[2] = i + 2; + for (int j = 0; j < 3; j++) { + assert(value.value[j] == expected_value.value[j]); + } + i++; + } + + // Get the Column Reader for the Float column + column_reader = row_group_reader->Column(4); + parquet::FloatReader* float_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (float_reader->HasNext()) { + float value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + float expected_value = static_cast(i) * 1.1f; + assert(value == expected_value); + i++; + } + + // Get the Column Reader for the Double column + column_reader = row_group_reader->Column(5); + parquet::DoubleReader* double_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (double_reader->HasNext()) { + double value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + double expected_value = i * 1.1111111; + assert(value == expected_value); + i++; + } + + // Get the Column Reader for the ByteArray column + column_reader = row_group_reader->Column(6); + parquet::ByteArrayReader* ba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (ba_reader->HasNext()) { + parquet::ByteArray value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = + ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // Verify the value written + char expected_value[FIXED_LENGTH] = "parquet"; + expected_value[7] = static_cast('0' + i / 100); + expected_value[8] = static_cast('0' + (i / 10) % 10); + expected_value[9] = static_cast('0' + i % 10); + if (i % 2 == 0) { // only alternate values exist + // There are no NULL values in the rows written + assert(values_read == 1); + assert(value.len == FIXED_LENGTH); + assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0); + assert(definition_level == 1); + } else { + // There are NULL values in the rows written + assert(values_read == 0); + assert(definition_level == 0); + } + i++; + } + + // Get the Column Reader for the FixedLengthByteArray column + column_reader = row_group_reader->Column(7); + parquet::FixedLenByteArrayReader* flba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (flba_reader->HasNext()) { + parquet::FixedLenByteArray value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = flba_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + char v = static_cast(i); + char expected_value[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v}; + assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0); + i++; + } + } + } catch (const std::exception& e) { + std::cerr << "Parquet read error: " << e.what() << std::endl; + return -1; + } + + std::cout << "Parquet Writing and Reading Complete" << std::endl; + + return 0; +} From 138b89681bd29f40827b27eaff706829cafac774 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Thu, 31 Jan 2019 17:26:33 +0700 Subject: [PATCH 005/201] change due to new update in crypto package --- cpp/src/parquet/thrift.h | 13 +++++++++---- cpp/src/parquet/types.h | 8 ++++---- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 8f2e58ebb29e..b57c82e4773c 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -201,7 +201,7 @@ using ThriftBuffer = apache::thrift::transport::TMemoryBuffer; template inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg, const EncryptionProperties* encryption = NULLPTR, - bool shouldReadLength = true) { + bool shouldReadLength = false) { if (encryption == NULLPTR) { // Deserialize msg bytes into c++ thrift msg using memory transport. shared_ptr tmem_transport( @@ -233,7 +233,7 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali const uint8_t* cipherBuf = shouldReadLength ? 
&buf[4] : buf; std::vector decrypted_buffer(encryption->CalculatePlainSize(clen, true)); uint32_t decrypted_buffer_len = parquet_encryption::Decrypt( - encryption->algorithm(), true, cipherBuf, clen, encryption->key_bytes(), + encryption->algorithm(), true, cipherBuf, 0, encryption->key_bytes(), encryption->key_length(), encryption->aad_bytes(), encryption->aad_length(), decrypted_buffer.data()); if (decrypted_buffer_len <= 0) { @@ -241,7 +241,7 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali } DeserializeThriftMsg(decrypted_buffer.data(), &decrypted_buffer_len, deserialized_msg); - *len = 4 + clen; + *len = encryption->CalculateCipherSize(decrypted_buffer_len, true); } } @@ -274,7 +274,7 @@ class ThriftSerializer { template int64_t Serialize(const T* obj, ArrowOutputStream* out, const EncryptionProperties* encryption = NULLPTR, - bool shouldWriteLength = true) { + bool shouldWriteLength = false) { uint8_t* out_buffer; uint32_t out_length; SerializeToBuffer(obj, &out_length, &out_buffer); @@ -288,6 +288,11 @@ class ThriftSerializer { encryption->algorithm(), true, out_buffer, out_length, encryption->key_bytes(), encryption->key_length(), encryption->aad_bytes(), encryption->aad_length(), cipher_buffer.data()); + if (cipher_buffer_len > cipher_buffer.size()) { + std::stringstream ss; + ss << "cipher length is greater than cipher buffer capacity: " << cipher_buffer_len << cipher_buffer.size() << "\n"; + throw ParquetException(ss.str()); + } if (shouldWriteLength) { PARQUET_THROW_NOT_OK(out->Write(reinterpret_cast(&cipher_buffer_len), 4)); diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index d8da74505321..270e0b62cf8e 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -510,18 +510,18 @@ class PARQUET_EXPORT EncryptionProperties { uint32_t CalculateCipherSize(uint32_t plain_len, bool is_metadata = false) const { if (is_metadata || algorithm_ == Encryption::AES_GCM_V1) { - return plain_len + 28; + 
return plain_len + 28 + 4; } else if (algorithm_ == Encryption::AES_GCM_CTR_V1) { - return plain_len + 16; + return plain_len + 16 + 4; } return plain_len; } uint32_t CalculatePlainSize(uint32_t cipher_len, bool is_metadata = false) const { if (is_metadata || algorithm_ == Encryption::AES_GCM_V1) { - return cipher_len - 28; + return cipher_len - 28 - 4; } else if (algorithm_ == Encryption::AES_GCM_CTR_V1) { - return cipher_len - 16; + return cipher_len - 16 - 4; } return cipher_len; } From 31cf8714a2a27fca40492ca9f7055013cf49682a Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Fri, 1 Feb 2019 15:20:48 +0700 Subject: [PATCH 006/201] pass EncryptionProperties into parquet_encryption::Encrypt()/Decrypt() instead of pass it as many params --- cpp/src/parquet/column_reader.cc | 6 +++--- cpp/src/parquet/column_reader.h | 2 +- cpp/src/parquet/column_writer.cc | 4 ++-- cpp/src/parquet/file_writer.cc | 4 ++-- cpp/src/parquet/file_writer.h | 2 +- cpp/src/parquet/metadata.cc | 14 +++++++------- cpp/src/parquet/metadata.h | 6 +++--- cpp/src/parquet/thrift.h | 13 +++++-------- 8 files changed, 24 insertions(+), 27 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index e47aa1e9c6f8..c240dec8f165 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -116,7 +116,7 @@ class SerializedPageReader : public PageReader { public: SerializedPageReader(const std::shared_ptr& stream, int64_t total_num_rows, Compression::type codec, - std::shared_ptr encryption, + const std::shared_ptr encryption, ::arrow::MemoryPool* pool) : : stream_(stream), decompression_buffer_(AllocateBuffer(pool, 0)), @@ -179,7 +179,7 @@ std::shared_ptr SerializedPageReader::NextPage() { try { DeserializeThriftMsg(reinterpret_cast(buffer.data()), &header_size, ¤t_page_header_, - encryption_.get()); + encryption_); break; } catch (std::exception& e) { // Failed to deserialize. 
Double the allowed page header size and try again @@ -285,7 +285,7 @@ std::shared_ptr SerializedPageReader::NextPage() { std::unique_ptr PageReader::Open( const std::shared_ptr& stream, int64_t total_num_rows, - Compression::type codec, std::shared_ptr encryption, + Compression::type codec, const std::shared_ptr encryption, ::arrow::MemoryPool* pool) { return std::unique_ptr( new SerializedPageReader(stream, total_num_rows, codec, encryption, pool)); diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 47340368929f..17c5ad6524bb 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -80,7 +80,7 @@ class PARQUET_EXPORT PageReader { static std::unique_ptr Open( const std::shared_ptr& stream, int64_t total_num_rows, - Compression::type codec, std::shared_ptr encryption = NULLPTR, + Compression::type codec, const std::shared_ptr& encryption = NULLPTR, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); // @returns: shared_ptr(nullptr) on EOS, std::shared_ptr diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 34a3d63b6d57..6e359bb49795 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -196,7 +196,7 @@ class SerializedPageWriter : public PageWriter { if (dictionary_page_offset_ == 0) { dictionary_page_offset_ = start_pos; } - int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), encryption_.get()); + int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), encryption_); PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len)); total_uncompressed_size_ += uncompressed_size + header_size; @@ -276,7 +276,7 @@ class SerializedPageWriter : public PageWriter { data_page_offset_ = start_pos; } - int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), encryption_.get()); + int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), 
encryption_); PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len)); total_uncompressed_size_ += uncompressed_size + header_size; diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 7d8633cb4d6a..d9c84bb25a60 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -279,7 +279,7 @@ class FileSerializer : public ParquetFileWriter::Contents { std::shared_ptr footer_encryption = file_encryption->GetFooterEncryptionProperties(); - WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryption.get()); + WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryption); uint32_t footer_and_crypto_len = static_cast(sink_->Tell() - metadata_start); sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4); @@ -390,7 +390,7 @@ std::unique_ptr ParquetFileWriter::Open( } void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sink, - EncryptionProperties* footer_encryption) { + const std::shared_ptr& footer_encryption) { if (footer_encryption == nullptr) { // Write MetaData int64_t position = -1; diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h index cf3cd1312511..febcaec410e1 100644 --- a/cpp/src/parquet/file_writer.h +++ b/cpp/src/parquet/file_writer.h @@ -95,7 +95,7 @@ class PARQUET_EXPORT RowGroupWriter { ARROW_DEPRECATED("Use version with arrow::io::OutputStream*") PARQUET_EXPORT void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink, - EncryptionProperties* encryption_properties = NULLPTR); + const std::shared_ptr& encryption_properties = NULLPTR); void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, OutputStream* sink); diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 90b288ffeb04..885135db6a03 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -405,11 +405,11 @@ class FileMetaData::FileMetaDataImpl { FileMetaDataImpl() : metadata_len_(0) {} explicit 
FileMetaDataImpl(const void* metadata, uint32_t* metadata_len, - std::shared_ptr encryption = nullptr) + const std::shared_ptr& encryption = nullptr) : metadata_len_(0) { metadata_.reset(new format::FileMetaData); DeserializeThriftMsg(reinterpret_cast(metadata), metadata_len, - metadata_.get(), encryption.get(), false); + metadata_.get(), encryption, false); metadata_len_ = *metadata_len; if (metadata_->__isset.created_by) { @@ -437,7 +437,7 @@ class FileMetaData::FileMetaDataImpl { const ApplicationVersion& writer_version() const { return writer_version_; } - void WriteTo(::arrow::io::OutputStream* dst, EncryptionProperties* encryption) const { + void WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryption) const { ThriftSerializer serializer; serializer.Serialize(metadata_.get(), dst, encryption, false); } @@ -525,14 +525,14 @@ class FileMetaData::FileMetaDataImpl { std::shared_ptr FileMetaData::Make(const void* metadata, uint32_t* metadata_len, - std::shared_ptr encryption) { + const std::shared_ptr& encryption) { // This FileMetaData ctor is private, not compatible with std::make_shared return std::shared_ptr( new FileMetaData(metadata, metadata_len, encryption)); } FileMetaData::FileMetaData(const void* metadata, uint32_t* metadata_len, - std::shared_ptr encryption) + const std::shared_ptr& encryption) : impl_{std::unique_ptr( new FileMetaDataImpl(metadata, metadata_len, encryption))} {} @@ -586,7 +586,7 @@ void FileMetaData::AppendRowGroups(const FileMetaData& other) { impl_->AppendRowGroups(other.impl_); } -void FileMetaData::WriteTo(::arrow::io::OutputStream* dst, EncryptionProperties* encryption) const { +void FileMetaData::WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryption) const { return impl_->WriteTo(dst, encryption); } @@ -855,7 +855,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { auto encrypt_props = properties_->encryption(column_->path()); uint64_t metadata_start = sink->Tell(); - 
serializer.Serialize(&column_metadata_, sink, encrypt_props.get()); + serializer.Serialize(&column_metadata_, sink, encrypt_props); // Set the ColumnMetaData offset at the “file_offset” field in the ColumnChunk. column_chunk_->__set_file_offset(metadata_start); diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index a961f624cb66..d1f1f936a6ee 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -185,7 +185,7 @@ class PARQUET_EXPORT FileMetaData { // API convenience to get a MetaData accessor static std::shared_ptr Make(const void* serialized_metadata, uint32_t* metadata_len, - std::shared_ptr encryption = NULLPTR); + const std::shared_ptr& encryption = NULLPTR); ~FileMetaData(); @@ -204,7 +204,7 @@ class PARQUET_EXPORT FileMetaData { const ApplicationVersion& writer_version() const; - void WriteTo(::arrow::io::OutputStream* dst, EncryptionProperties* encryption = NULLPTR) const; + void WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryption = NULLPTR) const; // Return const-pointer to make it clear that this object is not to be copied const SchemaDescriptor* schema() const; @@ -220,7 +220,7 @@ class PARQUET_EXPORT FileMetaData { private: friend FileMetaDataBuilder; explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len, - std::shared_ptr encryption = NULLPTR); + const std::shared_ptr& encryption = NULLPTR); // PIMPL Idiom FileMetaData(); diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index b57c82e4773c..53f9b84c3a6e 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -200,7 +200,7 @@ using ThriftBuffer = apache::thrift::transport::TMemoryBuffer; // set to the actual length of the header. 
template inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg, - const EncryptionProperties* encryption = NULLPTR, + const std::shared_ptr& encryption = NULLPTR, bool shouldReadLength = false) { if (encryption == NULLPTR) { // Deserialize msg bytes into c++ thrift msg using memory transport. @@ -233,9 +233,7 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali const uint8_t* cipherBuf = shouldReadLength ? &buf[4] : buf; std::vector decrypted_buffer(encryption->CalculatePlainSize(clen, true)); uint32_t decrypted_buffer_len = parquet_encryption::Decrypt( - encryption->algorithm(), true, cipherBuf, 0, encryption->key_bytes(), - encryption->key_length(), encryption->aad_bytes(), encryption->aad_length(), - decrypted_buffer.data()); + encryption, true, cipherBuf, 0, decrypted_buffer.data()); if (decrypted_buffer_len <= 0) { throw ParquetException("Couldn't decrypt buffer\n"); } @@ -273,7 +271,8 @@ class ThriftSerializer { } template - int64_t Serialize(const T* obj, ArrowOutputStream* out, const EncryptionProperties* encryption = NULLPTR, + int64_t Serialize(const T* obj, ArrowOutputStream* out, + const std::shared_ptr& encryption = NULLPTR, bool shouldWriteLength = false) { uint8_t* out_buffer; uint32_t out_length; @@ -285,9 +284,7 @@ class ThriftSerializer { } else { std::vector cipher_buffer(encryption->CalculateCipherSize(out_length)); int cipher_buffer_len = parquet_encryption::Encrypt( - encryption->algorithm(), true, out_buffer, out_length, encryption->key_bytes(), - encryption->key_length(), encryption->aad_bytes(), encryption->aad_length(), - cipher_buffer.data()); + encryption, true, out_buffer, out_length, cipher_buffer.data()); if (cipher_buffer_len > cipher_buffer.size()) { std::stringstream ss; ss << "cipher length is greater than cipher buffer capacity: " << cipher_buffer_len << cipher_buffer.size() << "\n"; From 3ebebb597aeca26a13f012eeab37afe826947acb Mon Sep 17 00:00:00 2001 From: 
thamht4190 Date: Tue, 26 Feb 2019 09:57:16 +0700 Subject: [PATCH 007/201] fix issue of wrong column name in encryption-example and remove FileEncryptionProperties::uniform_encryption_ --- .../low-level-api/encryption-reader-writer.cc | 4 +-- cpp/src/parquet/properties.h | 30 +++++-------------- 2 files changed, 9 insertions(+), 25 deletions(-) diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc index 0f8e9ab3e832..42f9b45e24f1 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc @@ -73,7 +73,7 @@ int main(int argc, char** argv) { // non-uniform with column keys std::map> encryption_cols; - parquet::ColumnEncryptionProperties::Builder encryption_col_builder0("column_0", true); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder0("ba_field", true); encryption_col_builder0.key(COLUMN_ENCRYPTION_KEY); auto encryption_col0 = encryption_col_builder0.build(); @@ -199,7 +199,7 @@ int main(int argc, char** argv) { // decryption properties std::shared_ptr decryption_properties = std::make_shared(FOOTER_ENCRYPTION_KEY); - decryption_properties->SetColumnKey("column_0", COLUMN_ENCRYPTION_KEY); + decryption_properties->SetColumnKey("ba_field", COLUMN_ENCRYPTION_KEY); parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); reader_properties.file_decryption(decryption_properties); diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index a54162bb55ea..624ab65bf9c1 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -52,6 +52,7 @@ class PARQUET_EXPORT ColumnEncryptionProperties { Builder* key(const std::string& key) { DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); DCHECK(encrypt_); + encrypted_with_footer_key_ = false; key_ = key; return this; @@ -269,10 +270,10 @@ class PARQUET_EXPORT 
FileEncryptionProperties { public: class Builder { public: - Builder() : algorithm_(DEFAULT_ENCRYPTION_ALGORITHM), uniform_encryption_(true) {} + Builder() : algorithm_(DEFAULT_ENCRYPTION_ALGORITHM) {} Builder(const std::string& key) - : algorithm_(DEFAULT_ENCRYPTION_ALGORITHM), uniform_encryption_(true) { + : algorithm_(DEFAULT_ENCRYPTION_ALGORITHM) { DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); footer_key_ = key; } @@ -324,11 +325,8 @@ class PARQUET_EXPORT FileEncryptionProperties { column_properties_ = column_properties; if (!footer_key_.empty()) { - uniform_encryption_ = true; - for (const auto& col : column_properties) { if (col.second->key().compare(footer_key_) != 0) { - uniform_encryption_ = false; break; } } @@ -359,7 +357,7 @@ class PARQUET_EXPORT FileEncryptionProperties { footer_encryption.reset(new EncryptionProperties(algorithm_, footer_key_, aad_)); } return std::make_shared( - footer_encryption, footer_key_metadata_, aad_metadata_, uniform_encryption_, + footer_encryption, footer_key_metadata_, aad_metadata_, column_properties_, encrypt_the_rest_); } @@ -371,8 +369,6 @@ class PARQUET_EXPORT FileEncryptionProperties { std::string aad_; std::string aad_metadata_; - bool uniform_encryption_; - std::map> column_properties_; bool encrypt_the_rest_; }; @@ -380,14 +376,12 @@ class PARQUET_EXPORT FileEncryptionProperties { FileEncryptionProperties( const std::shared_ptr& footer_encryption, const std::string& footer_key_metadata, const std::string& aad_metadata, - bool uniform_encryption, const std::map>& column_properties, bool encrypt_the_rest) : footer_encryption_(footer_encryption), footer_key_metadata_(footer_key_metadata), aad_metadata_(aad_metadata), - uniform_encryption_(uniform_encryption), column_properties_(column_properties), encrypt_the_rest_(encrypt_the_rest) {} @@ -401,11 +395,6 @@ class PARQUET_EXPORT FileEncryptionProperties { std::shared_ptr GetColumnCryptoMetaData( const std::shared_ptr& path) { - // uniform 
encryption - if (uniform_encryption_) { - return ColumnEncryptionProperties::Builder(path->ToDotString(), true).build(); - } - // non-uniform encryption std::string path_str = path->ToDotString(); if (column_properties_.find(path_str) != column_properties_.end()) { @@ -417,17 +406,12 @@ class PARQUET_EXPORT FileEncryptionProperties { return ColumnEncryptionProperties::Builder(path->ToDotString(), true).build(); } - // unencrypted + // uniform encryption or unencrypted return ColumnEncryptionProperties::Builder(path->ToDotString(), false).build(); } std::shared_ptr GetColumnEncryptionProperties( const std::shared_ptr& path) { - // uniform encryption - if (uniform_encryption_) { - return footer_encryption_; - } - // non-uniform encryption std::string path_str = path->ToDotString(); if (column_properties_.find(path_str) != column_properties_.end()) { @@ -436,10 +420,12 @@ class PARQUET_EXPORT FileEncryptionProperties { footer_encryption_->aad()); } + // encrypted with footer key if (encrypt_the_rest_) { return footer_encryption_; } + // uniform encryption or unencrypted return NULLPTR; } @@ -448,8 +434,6 @@ class PARQUET_EXPORT FileEncryptionProperties { std::string footer_key_metadata_; std::string aad_metadata_; - bool uniform_encryption_; - std::map> column_properties_; bool encrypt_the_rest_; }; From 189b2f28ec5d49273c71d202ec79e964ba52e956 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Tue, 26 Feb 2019 09:58:20 +0700 Subject: [PATCH 008/201] get column path from ColumnCryptoMetadata when column is encrypted with its own key --- cpp/src/parquet/file_reader.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 497e9758596a..0d7eb066b439 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -162,13 +162,15 @@ class SerializedRowGroup : public RowGroupReader::Contents { // file is non-uniform encrypted and the column is encrypted with its own key 
std::string column_key_metadata = crypto_metadata->key_metadata(); + std::shared_ptr column_path = + std::make_shared(crypto_metadata->path_in_schema()); // encrypted with column key std::string column_key = - file_decryption->GetColumnKey(col->path_in_schema(), column_key_metadata); + file_decryption->GetColumnKey(column_path, column_key_metadata); if (column_key.empty()) { throw ParquetException("column is encrypted with null key, path=" + - col->path_in_schema()->ToDotString()); + column_path->ToDotString()); } auto column_encryption = std::make_shared( file_crypto_metadata_->encryption_algorithm().algorithm, column_key, From 8dcf50cc6cf66f6cfbc89a73f01556da7e0df279 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Tue, 26 Feb 2019 18:11:52 +0700 Subject: [PATCH 009/201] let encryption examples to be able to cover more cases --- .../low-level-api/encryption-reader-writer.cc | 728 +++++++++--------- 1 file changed, 371 insertions(+), 357 deletions(-) diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc index 42f9b45e24f1..b762f46ddfb1 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc @@ -48,389 +48,403 @@ const std::string FOOTER_ENCRYPTION_KEY = "0123456789012345"; // 16 bytes const std::string COLUMN_ENCRYPTION_KEY = "1234567890123450"; // 16 bytes int main(int argc, char** argv) { - /********************************************************************************** - PARQUET WRITER EXAMPLE - **********************************************************************************/ - // parquet::REQUIRED fields do not need definition and repetition level values - // parquet::OPTIONAL fields require only definition level values - // parquet::REPEATED fields require both definition and repetition level values - try { - // Create a local file output stream instance. 
- using FileClass = ::arrow::io::FileOutputStream; - std::shared_ptr out_file; - PARQUET_THROW_NOT_OK(FileClass::Open(PARQUET_FILENAME, &out_file)); - - // Setup the parquet schema - std::shared_ptr schema = SetupSchema(); - - // Add writer properties - parquet::WriterProperties::Builder builder; - builder.compression(parquet::Compression::SNAPPY); - - // uniform encryption - parquet::FileEncryptionProperties::Builder file_encryption_builder; - file_encryption_builder.footer_key(FOOTER_ENCRYPTION_KEY); - - // non-uniform with column keys - std::map> encryption_cols; - parquet::ColumnEncryptionProperties::Builder encryption_col_builder0("ba_field", true); - encryption_col_builder0.key(COLUMN_ENCRYPTION_KEY); - auto encryption_col0 = encryption_col_builder0.build(); - - encryption_cols[encryption_col0->path()] = encryption_col0; - - file_encryption_builder.column_properties(encryption_cols, true); - - builder.encryption(file_encryption_builder.build()); - - std::shared_ptr props = builder.build(); - - // Create a ParquetFileWriter instance - std::shared_ptr file_writer = - parquet::ParquetFileWriter::Open(out_file, schema, props); - - // Append a RowGroup with a specific number of rows. - parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup(); - - // Write the Bool column - parquet::BoolWriter* bool_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { - bool value = ((i % 2) == 0) ? 
true : false; - bool_writer->WriteBatch(1, nullptr, nullptr, &value); - } - // Write the Int32 column - parquet::Int32Writer* int32_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { - int32_t value = i; - int32_writer->WriteBatch(1, nullptr, nullptr, &value); - } + std::vector> file_encryption_properties; + std::vector> file_decryption_properties; + + // uniform encryption + parquet::FileEncryptionProperties::Builder file_encryption_builder_1; + file_encryption_builder_1.footer_key(FOOTER_ENCRYPTION_KEY); + + std::shared_ptr decryption_properties_1 = + std::make_shared(FOOTER_ENCRYPTION_KEY); + + // non-uniform with column keys + std::map> encryption_cols; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_0("ba_field", true); + encryption_col_builder_0.key(COLUMN_ENCRYPTION_KEY); + auto encryption_col0 = encryption_col_builder_0.build(); + encryption_cols[encryption_col0->path()] = encryption_col0; + + parquet::FileEncryptionProperties::Builder file_encryption_builder_2; + file_encryption_builder_2.footer_key(FOOTER_ENCRYPTION_KEY); + file_encryption_builder_2.column_properties(encryption_cols, true); + + std::shared_ptr decryption_properties_2 = + std::make_shared(FOOTER_ENCRYPTION_KEY); + decryption_properties_2->SetColumnKey("ba_field", COLUMN_ENCRYPTION_KEY); + + file_encryption_properties.push_back(file_encryption_builder_1.build()); + file_encryption_properties.push_back(file_encryption_builder_2.build()); + + file_decryption_properties.push_back(decryption_properties_1); + file_decryption_properties.push_back(decryption_properties_2); + + for (int i = 0; i < file_encryption_properties.size(); ++i) { + /********************************************************************************** + PARQUET WRITER EXAMPLE + **********************************************************************************/ + // parquet::REQUIRED fields do not need definition and repetition level values + // 
parquet::OPTIONAL fields require only definition level values + // parquet::REPEATED fields require both definition and repetition level values + // setup for encryption + try { + + // Create a local file output stream instance. + using FileClass = ::arrow::io::FileOutputStream; + std::shared_ptr out_file; + PARQUET_THROW_NOT_OK(FileClass::Open(PARQUET_FILENAME, &out_file)); + + // Setup the parquet schema + std::shared_ptr schema = SetupSchema(); + + // Add writer properties + parquet::WriterProperties::Builder builder; + builder.compression(parquet::Compression::SNAPPY); + builder.encryption(file_encryption_properties[i]); + + std::shared_ptr props = builder.build(); + + // Create a ParquetFileWriter instance + std::shared_ptr file_writer = + parquet::ParquetFileWriter::Open(out_file, schema, props); + + // Append a RowGroup with a specific number of rows. + parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup(); + + // Write the Bool column + parquet::BoolWriter* bool_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + bool value = ((i % 2) == 0) ? true : false; + bool_writer->WriteBatch(1, nullptr, nullptr, &value); + } - // Write the Int64 column. Each row has repeats twice. - parquet::Int64Writer* int64_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < 2 * NUM_ROWS_PER_ROW_GROUP; i++) { - int64_t value = i * 1000 * 1000; - value *= 1000 * 1000; - int16_t definition_level = 1; - int16_t repetition_level = 0; - if ((i % 2) == 0) { - repetition_level = 1; // start of a new record + // Write the Int32 column + parquet::Int32Writer* int32_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + int32_t value = i; + int32_writer->WriteBatch(1, nullptr, nullptr, &value); } - int64_writer->WriteBatch(1, &definition_level, &repetition_level, &value); - } - // Write the INT96 column. 
- parquet::Int96Writer* int96_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { - parquet::Int96 value; - value.value[0] = i; - value.value[1] = i + 1; - value.value[2] = i + 2; - int96_writer->WriteBatch(1, nullptr, nullptr, &value); - } + // Write the Int64 column. Each row has repeats twice. + parquet::Int64Writer* int64_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < 2 * NUM_ROWS_PER_ROW_GROUP; i++) { + int64_t value = i * 1000 * 1000; + value *= 1000 * 1000; + int16_t definition_level = 1; + int16_t repetition_level = 0; + if ((i % 2) == 0) { + repetition_level = 1; // start of a new record + } + int64_writer->WriteBatch(1, &definition_level, &repetition_level, &value); + } - // Write the Float column - parquet::FloatWriter* float_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { - float value = static_cast(i) * 1.1f; - float_writer->WriteBatch(1, nullptr, nullptr, &value); - } + // Write the INT96 column. + parquet::Int96Writer* int96_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + parquet::Int96 value; + value.value[0] = i; + value.value[1] = i + 1; + value.value[2] = i + 2; + int96_writer->WriteBatch(1, nullptr, nullptr, &value); + } - // Write the Double column - parquet::DoubleWriter* double_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { - double value = i * 1.1111111; - double_writer->WriteBatch(1, nullptr, nullptr, &value); - } + // Write the Float column + parquet::FloatWriter* float_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + float value = static_cast(i) * 1.1f; + float_writer->WriteBatch(1, nullptr, nullptr, &value); + } - // Write the ByteArray column. 
Make every alternate values NULL - parquet::ByteArrayWriter* ba_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { - parquet::ByteArray value; - char hello[FIXED_LENGTH] = "parquet"; - hello[7] = static_cast(static_cast('0') + i / 100); - hello[8] = static_cast(static_cast('0') + (i / 10) % 10); - hello[9] = static_cast(static_cast('0') + i % 10); - if (i % 2 == 0) { - int16_t definition_level = 1; - value.ptr = reinterpret_cast(&hello[0]); - value.len = FIXED_LENGTH; - ba_writer->WriteBatch(1, &definition_level, nullptr, &value); - } else { - int16_t definition_level = 0; - ba_writer->WriteBatch(1, &definition_level, nullptr, nullptr); + // Write the Double column + parquet::DoubleWriter* double_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + double value = i * 1.1111111; + double_writer->WriteBatch(1, nullptr, nullptr, &value); } - } - // Write the FixedLengthByteArray column - parquet::FixedLenByteArrayWriter* flba_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { - parquet::FixedLenByteArray value; - char v = static_cast(i); - char flba[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v}; - value.ptr = reinterpret_cast(&flba[0]); + // Write the ByteArray column. 
Make every alternate values NULL + parquet::ByteArrayWriter* ba_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + parquet::ByteArray value; + char hello[FIXED_LENGTH] = "parquet"; + hello[7] = static_cast(static_cast('0') + i / 100); + hello[8] = static_cast(static_cast('0') + (i / 10) % 10); + hello[9] = static_cast(static_cast('0') + i % 10); + if (i % 2 == 0) { + int16_t definition_level = 1; + value.ptr = reinterpret_cast(&hello[0]); + value.len = FIXED_LENGTH; + ba_writer->WriteBatch(1, &definition_level, nullptr, &value); + } else { + int16_t definition_level = 0; + ba_writer->WriteBatch(1, &definition_level, nullptr, nullptr); + } + } - flba_writer->WriteBatch(1, nullptr, nullptr, &value); - } + // Write the FixedLengthByteArray column + parquet::FixedLenByteArrayWriter* flba_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + parquet::FixedLenByteArray value; + char v = static_cast(i); + char flba[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v}; + value.ptr = reinterpret_cast(&flba[0]); - // Close the ParquetFileWriter - file_writer->Close(); + flba_writer->WriteBatch(1, nullptr, nullptr, &value); + } - // Write the bytes to file - DCHECK(out_file->Close().ok()); - } catch (const std::exception& e) { - std::cerr << "Parquet write error: " << e.what() << std::endl; - return -1; - } + // Close the ParquetFileWriter + file_writer->Close(); - /********************************************************************************** - PARQUET READER EXAMPLE - **********************************************************************************/ - - try { - // decryption properties - std::shared_ptr decryption_properties = - std::make_shared(FOOTER_ENCRYPTION_KEY); - decryption_properties->SetColumnKey("ba_field", COLUMN_ENCRYPTION_KEY); - - parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); - 
reader_properties.file_decryption(decryption_properties); - - // Create a ParquetReader instance - std::unique_ptr parquet_reader = - parquet::ParquetFileReader::OpenFile(PARQUET_FILENAME, false, reader_properties); - - // Get the File MetaData - std::shared_ptr file_metadata = parquet_reader->metadata(); - - // Get the number of RowGroups - int num_row_groups = file_metadata->num_row_groups(); - assert(num_row_groups == 1); - - // Get the number of Columns - int num_columns = file_metadata->num_columns(); - assert(num_columns == 8); - - // Iterate over all the RowGroups in the file - for (int r = 0; r < num_row_groups; ++r) { - // Get the RowGroup Reader - std::shared_ptr row_group_reader = - parquet_reader->RowGroup(r); - - int64_t values_read = 0; - int64_t rows_read = 0; - int16_t definition_level; - int16_t repetition_level; - int i; - std::shared_ptr column_reader; - - // Get the Column Reader for the boolean column - column_reader = row_group_reader->Column(0); - parquet::BoolReader* bool_reader = - static_cast(column_reader.get()); - - // Read all the rows in the column - i = 0; - while (bool_reader->HasNext()) { - bool value; - // Read one value at a time. The number of rows read is returned. values_read - // contains the number of non-null rows - rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // There are no NULL values in the rows written - assert(values_read == 1); - // Verify the value written - bool expected_value = ((i % 2) == 0) ? 
true : false; - assert(value == expected_value); - i++; - } + // Write the bytes to file + DCHECK(out_file->Close().ok()); + } catch (const std::exception& e) { + std::cerr << "Parquet write error: " << e.what() << std::endl; + return -1; + } - // Get the Column Reader for the Int32 column - column_reader = row_group_reader->Column(1); - parquet::Int32Reader* int32_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (int32_reader->HasNext()) { - int32_t value; - // Read one value at a time. The number of rows read is returned. values_read - // contains the number of non-null rows - rows_read = int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // There are no NULL values in the rows written - assert(values_read == 1); - // Verify the value written - assert(value == i); - i++; - } + /********************************************************************************** + PARQUET READER EXAMPLE + **********************************************************************************/ + + try { + parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); + reader_properties.file_decryption(file_decryption_properties[i]); + + // Create a ParquetReader instance + std::unique_ptr parquet_reader = + parquet::ParquetFileReader::OpenFile(PARQUET_FILENAME, false, reader_properties); + + // Get the File MetaData + std::shared_ptr file_metadata = parquet_reader->metadata(); + + // Get the number of RowGroups + int num_row_groups = file_metadata->num_row_groups(); + assert(num_row_groups == 1); + + // Get the number of Columns + int num_columns = file_metadata->num_columns(); + assert(num_columns == 8); + + // Iterate over all the RowGroups in the file + for (int r = 0; r < num_row_groups; ++r) { + // Get the RowGroup Reader + std::shared_ptr row_group_reader = + parquet_reader->RowGroup(r); + + int64_t values_read = 0; + int64_t 
rows_read = 0; + int16_t definition_level; + int16_t repetition_level; + int i; + std::shared_ptr column_reader; + + // Get the Column Reader for the boolean column + column_reader = row_group_reader->Column(0); + parquet::BoolReader* bool_reader = + static_cast(column_reader.get()); + + // Read all the rows in the column + i = 0; + while (bool_reader->HasNext()) { + bool value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + bool expected_value = ((i % 2) == 0) ? true : false; + assert(value == expected_value); + i++; + } - // Get the Column Reader for the Int64 column - column_reader = row_group_reader->Column(2); - parquet::Int64Reader* int64_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (int64_reader->HasNext()) { - int64_t value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = int64_reader->ReadBatch(1, &definition_level, &repetition_level, - &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // There are no NULL values in the rows written - assert(values_read == 1); - // Verify the value written - int64_t expected_value = i * 1000 * 1000; - expected_value *= 1000 * 1000; - assert(value == expected_value); - if ((i % 2) == 0) { - assert(repetition_level == 1); - } else { - assert(repetition_level == 0); + // Get the Column Reader for the Int32 column + column_reader = row_group_reader->Column(1); + parquet::Int32Reader* int32_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int32_reader->HasNext()) { + int32_t value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + assert(value == i); + i++; } - i++; - } - // Get the Column Reader for the Int96 column - column_reader = row_group_reader->Column(3); - parquet::Int96Reader* int96_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (int96_reader->HasNext()) { - parquet::Int96 value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = int96_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // There are no NULL values in the rows written - assert(values_read == 1); - // Verify the value written - parquet::Int96 expected_value; - expected_value.value[0] = i; - expected_value.value[1] = i + 1; - expected_value.value[2] = i + 2; - for (int j = 0; j < 3; j++) { - assert(value.value[j] == expected_value.value[j]); + // Get the Column Reader for the Int64 column + column_reader = row_group_reader->Column(2); + parquet::Int64Reader* int64_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int64_reader->HasNext()) { + int64_t value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = int64_reader->ReadBatch(1, &definition_level, &repetition_level, + &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + int64_t expected_value = i * 1000 * 1000; + expected_value *= 1000 * 1000; + assert(value == expected_value); + if ((i % 2) == 0) { + assert(repetition_level == 1); + } else { + assert(repetition_level == 0); + } + i++; } - i++; - } - // Get the Column Reader for the Float column - column_reader = row_group_reader->Column(4); - parquet::FloatReader* float_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (float_reader->HasNext()) { - float value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // There are no NULL values in the rows written - assert(values_read == 1); - // Verify the value written - float expected_value = static_cast(i) * 1.1f; - assert(value == expected_value); - i++; - } + // Get the Column Reader for the Int96 column + column_reader = row_group_reader->Column(3); + parquet::Int96Reader* int96_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int96_reader->HasNext()) { + parquet::Int96 value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = int96_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + parquet::Int96 expected_value; + expected_value.value[0] = i; + expected_value.value[1] = i + 1; + expected_value.value[2] = i + 2; + for (int j = 0; j < 3; j++) { + assert(value.value[j] == expected_value.value[j]); + } + i++; + } - // Get the Column Reader for the Double column - column_reader = row_group_reader->Column(5); - parquet::DoubleReader* double_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (double_reader->HasNext()) { - double value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // There are no NULL values in the rows written - assert(values_read == 1); - // Verify the value written - double expected_value = i * 1.1111111; - assert(value == expected_value); - i++; - } + // Get the Column Reader for the Float column + column_reader = row_group_reader->Column(4); + parquet::FloatReader* float_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (float_reader->HasNext()) { + float value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + float expected_value = static_cast(i) * 1.1f; + assert(value == expected_value); + i++; + } - // Get the Column Reader for the ByteArray column - column_reader = row_group_reader->Column(6); - parquet::ByteArrayReader* ba_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (ba_reader->HasNext()) { - parquet::ByteArray value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = - ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // Verify the value written - char expected_value[FIXED_LENGTH] = "parquet"; - expected_value[7] = static_cast('0' + i / 100); - expected_value[8] = static_cast('0' + (i / 10) % 10); - expected_value[9] = static_cast('0' + i % 10); - if (i % 2 == 0) { // only alternate values exist + // Get the Column Reader for the Double column + column_reader = row_group_reader->Column(5); + parquet::DoubleReader* double_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (double_reader->HasNext()) { + double value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); // There are no NULL values in the rows written assert(values_read == 1); - assert(value.len == FIXED_LENGTH); - assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0); - assert(definition_level == 1); - } else { - // There are NULL values in the rows written - assert(values_read == 0); - assert(definition_level == 0); + // Verify the value written + double expected_value = i * 1.1111111; + assert(value == expected_value); + i++; } - i++; - } - // Get the Column Reader for the FixedLengthByteArray column - column_reader = row_group_reader->Column(7); - parquet::FixedLenByteArrayReader* flba_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (flba_reader->HasNext()) { - parquet::FixedLenByteArray value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = flba_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // There are no NULL values in the rows written - assert(values_read == 1); - // Verify the value written - char v = static_cast(i); - char expected_value[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v}; - assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0); - i++; + // Get the Column Reader for the ByteArray column + column_reader = row_group_reader->Column(6); + parquet::ByteArrayReader* ba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (ba_reader->HasNext()) { + parquet::ByteArray value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = + ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // Verify the value written + char expected_value[FIXED_LENGTH] = "parquet"; + expected_value[7] = static_cast('0' + i / 100); + expected_value[8] = static_cast('0' + (i / 10) % 10); + expected_value[9] = static_cast('0' + i % 10); + if (i % 2 == 0) { // only alternate values exist + // There are no NULL values in the rows written + assert(values_read == 1); + assert(value.len == FIXED_LENGTH); + assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0); + assert(definition_level == 1); + } else { + // There are NULL values in the rows written + assert(values_read == 0); + assert(definition_level == 0); + } + i++; + } + + // Get the Column Reader for the FixedLengthByteArray column + column_reader = row_group_reader->Column(7); + parquet::FixedLenByteArrayReader* flba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (flba_reader->HasNext()) { + parquet::FixedLenByteArray value; + // Read one 
value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = flba_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + char v = static_cast(i); + char expected_value[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v}; + assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0); + i++; + } } + } catch (const std::exception& e) { + std::cerr << "Parquet read error: " << e.what() << std::endl; } - } catch (const std::exception& e) { - std::cerr << "Parquet read error: " << e.what() << std::endl; - return -1; - } - - std::cout << "Parquet Writing and Reading Complete" << std::endl; + std::cout << "Example [" << (i+1) << "] Parquet Writing and Reading Complete" << std::endl; + } return 0; -} +} \ No newline at end of file From bd96d4307b57ff4fbcf22603e77e75bd28a25511 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Tue, 26 Feb 2019 18:23:21 +0700 Subject: [PATCH 010/201] footer plaintext mode --- cpp/src/parquet/file_reader.cc | 44 +++++++++++++++++++--- cpp/src/parquet/file_writer.cc | 61 +++++++++++++++++++++--------- cpp/src/parquet/file_writer.h | 3 +- cpp/src/parquet/metadata.cc | 69 ++++++++++++++++++++++++++++++++-- cpp/src/parquet/metadata.h | 8 +++- cpp/src/parquet/properties.h | 12 ++++-- cpp/src/parquet/thrift.h | 23 +++++++++++- 7 files changed, 187 insertions(+), 33 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 0d7eb066b439..08cef61e3bbd 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -129,7 +129,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { // file is unencrypted // or file is encrypted but column is unencrypted - if (!file_crypto_metadata_ || !crypto_metadata) { + if ((!file_crypto_metadata_ && 
!file_metadata_->is_plaintext_mode()) || !crypto_metadata) { encrypted = false; } @@ -144,16 +144,22 @@ class SerializedRowGroup : public RowGroupReader::Contents { // the column is encrypted with footer key if (crypto_metadata->encrypted_with_footer_key()) { - std::string footer_key_metadata = file_crypto_metadata_->key_metadata(); + const std::string& footer_key_metadata = file_metadata_->is_plaintext_mode() + ? file_metadata_->footer_signing_key_metadata() + : file_crypto_metadata_->key_metadata(); + std::string footer_key = file_decryption->GetFooterKey(footer_key_metadata); if (footer_key.empty()) { throw ParquetException("column is encrypted with null footer key"); } + Encryption::type algorithm = file_metadata_->is_plaintext_mode() + ? file_metadata_->encryption_algorithm().algorithm + : file_crypto_metadata_->encryption_algorithm().algorithm; + auto footer_encryption = std::make_shared( - file_crypto_metadata_->encryption_algorithm().algorithm, footer_key, - file_decryption->GetAad()); + algorithm, footer_key, file_decryption->GetAad()); return PageReader::Open(stream, col->num_values(), col->compression(), footer_encryption, properties_.memory_pool()); @@ -163,7 +169,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::string column_key_metadata = crypto_metadata->key_metadata(); std::shared_ptr column_path = - std::make_shared(crypto_metadata->path_in_schema()); + std::make_shared(crypto_metadata->path_in_schema()); // encrypted with column key std::string column_key = file_decryption->GetColumnKey(column_path, column_key_metadata); @@ -265,7 +271,33 @@ class SerializedFile : public ParquetFileReader::Contents { throw ParquetException("Invalid parquet file. 
Could not read metadata bytes."); } } - file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &metadata_len); + + uint32_t read_metadata_len; + file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &read_metadata_len); + + if (file_metadata_->is_plaintext_mode()) { + if (metadata_len - read_metadata_len != 28) { + throw ParquetException("Invalid parquet file. Cannot verify plaintext mode footer."); + } + // get footer key + std::string footer_key_metadata = file_metadata_->footer_signing_key_metadata(); + auto file_decryption = properties_.file_decryption(); + if (file_decryption == nullptr) { + throw ParquetException("No decryption properties are provided. Could not verify plaintext footer metadata"); + } + std::string footer_key = file_decryption->GetFooterKey(footer_key_metadata); + if (footer_key.empty()) { + throw ParquetException("No footer key are provided. Could not verify plaintext footer metadata"); + } + // TODO: aad + auto encryption = std::make_shared( + file_metadata_->encryption_algorithm().algorithm, + footer_key + ); + if (! file_metadata_->verify(encryption, metadata_buffer->data() + read_metadata_len, 28)) { + throw ParquetException("Invalid parquet file. 
Could not verify plaintext footer metadata"); + } + } } // encryption with encrypted footer else { diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index d9c84bb25a60..ff83ded10487 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -267,23 +267,39 @@ class FileSerializer : public ParquetFileWriter::Contents { row_group_writer_.reset(); // Write magic bytes and metadata - file_metadata_ = metadata_->Finish(); - auto file_encryption = properties_->file_encryption(); if (file_encryption == nullptr) { + file_metadata_ = metadata_->Finish(); WriteFileMetaData(*file_metadata_, sink_.get()); - } else { - uint64_t metadata_start = static_cast(sink_->Tell()); - auto crypto_metadata = metadata_->GetCryptoMetaData(); - WriteFileCryptoMetaData(*crypto_metadata, sink_.get()); - - std::shared_ptr footer_encryption = - file_encryption->GetFooterEncryptionProperties(); - WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryption); - uint32_t footer_and_crypto_len = static_cast(sink_->Tell() - metadata_start); - sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4); - - sink_->Write(PARQUET_EMAGIC, 4); + } + else { + if (file_encryption->encrypt_footer()) { + // encrypted footer + file_metadata_ = metadata_->Finish(); + + uint64_t metadata_start = static_cast(sink_->Tell()); + auto crypto_metadata = metadata_->GetCryptoMetaData(); + WriteFileCryptoMetaData(*crypto_metadata, sink_.get()); + + std::shared_ptr footer_encryption = + file_encryption->GetFooterEncryptionProperties(); + WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryption, true); + uint32_t footer_and_crypto_len = static_cast(sink_->Tell() - metadata_start); + sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4); + + sink_->Write(PARQUET_EMAGIC, 4); + } + else { + // footer plain mode + EncryptionAlgorithm signing_encryption; + signing_encryption.algorithm = Encryption::AES_GCM_V1; + // TODO: AAD + file_metadata_ = 
metadata_->Finish(&signing_encryption, file_encryption->footer_key_metadata()); + + std::shared_ptr footer_encryption = + file_encryption->GetFooterEncryptionProperties(); + WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryption, false); + } } sink_->Close(); @@ -390,7 +406,8 @@ std::unique_ptr ParquetFileWriter::Open( } void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sink, - const std::shared_ptr& footer_encryption) { + const std::shared_ptr& footer_encryption, + bool encrypt_footer) { if (footer_encryption == nullptr) { // Write MetaData int64_t position = -1; @@ -405,8 +422,18 @@ void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sin PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast(&metadata_len), 4)); PARQUET_THROW_NOT_OK(sink->Write(PARQUET_MAGIC, 4)); } else { - // encrypt and write to sink - file_metadata.WriteTo(sink, footer_encryption); + if (encrypt_footer) { + // encrypt and write to sink + file_metadata.WriteTo(sink, footer_encryption); + } + else { + uint32_t metadata_len = static_cast(sink->Tell()); + file_metadata.WriteTo(sink, footer_encryption); + metadata_len = static_cast(sink->Tell()) - metadata_len; + + sink->Write(reinterpret_cast(&metadata_len), 4); + sink->Write(PARQUET_MAGIC, 4); + } } } diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h index febcaec410e1..7932d1968beb 100644 --- a/cpp/src/parquet/file_writer.h +++ b/cpp/src/parquet/file_writer.h @@ -95,7 +95,8 @@ class PARQUET_EXPORT RowGroupWriter { ARROW_DEPRECATED("Use version with arrow::io::OutputStream*") PARQUET_EXPORT void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink, - const std::shared_ptr& encryption_properties = NULLPTR); + const std::shared_ptr& encryption_properties = NULLPTR, + bool encrypt_footer = false); void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, OutputStream* sink); diff --git a/cpp/src/parquet/metadata.cc 
b/cpp/src/parquet/metadata.cc index 885135db6a03..69a71f3fdc29 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -423,6 +423,22 @@ class FileMetaData::FileMetaDataImpl { InitKeyValueMetadata(); } + bool verify(std::shared_ptr encryption, + const void* tail, uint32_t tail_len) { + // re-encrypt the footer + uint8_t* encrypted_file_metadata; + uint32_t encrypted_file_metadata_len; + ThriftSerializer serializer; + serializer.SerializeToBuffer(metadata_.get(), &encrypted_file_metadata_len, + &encrypted_file_metadata, encryption); + // compare + if (0 != memcmp(encrypted_file_metadata + encrypted_file_metadata_len - tail_len, + reinterpret_cast(tail), tail_len)) { + return false; + } + return true; + } + inline uint32_t size() const { return metadata_len_; } inline int num_columns() const { return schema_.num_columns(); } inline int64_t num_rows() const { return metadata_->num_rows; } @@ -434,12 +450,32 @@ class FileMetaData::FileMetaDataImpl { inline int num_schema_elements() const { return static_cast(metadata_->schema.size()); } + inline bool is_plaintext_mode() const { return metadata_->__isset.encryption_algorithm; } + inline EncryptionAlgorithm encryption_algorithm() { + return FromThrift(metadata_->encryption_algorithm); + } + inline const std::string& footer_signing_key_metadata() { + return metadata_->footer_signing_key_metadata; + } const ApplicationVersion& writer_version() const { return writer_version_; } void WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryption) const { ThriftSerializer serializer; - serializer.Serialize(metadata_.get(), dst, encryption, false); + if (is_plaintext_mode()) { + serializer.Serialize(metadata_.get(), dst); + // 1. encrypt the footer key + uint8_t* encrypted_file_metadata; + uint32_t encrypted_file_metadata_len; + ThriftSerializer serializer; + serializer.SerializeToBuffer(metadata_.get(), &encrypted_file_metadata_len, + &encrypted_file_metadata, encryption); + // 2. 
write 28 bytes of nonce_and_tag (at the end of encrypted file metadata) + dst->Write(encrypted_file_metadata+encrypted_file_metadata_len-28, 28); + } + else { + serializer.Serialize(metadata_.get(), dst, encryption, false); + } } std::unique_ptr RowGroup(int i) { @@ -545,6 +581,11 @@ std::unique_ptr FileMetaData::RowGroup(int i) const { return impl_->RowGroup(i); } +bool FileMetaData::verify(std::shared_ptr encryption, + const void* tail, uint32_t tail_len) { + return impl_->verify(encryption, tail, tail_len); +} + uint32_t FileMetaData::size() const { return impl_->size(); } int FileMetaData::num_columns() const { return impl_->num_columns(); } @@ -553,6 +594,16 @@ int64_t FileMetaData::num_rows() const { return impl_->num_rows(); } int FileMetaData::num_row_groups() const { return impl_->num_row_groups(); } +bool FileMetaData::is_plaintext_mode() const { return impl_->is_plaintext_mode(); } + +EncryptionAlgorithm FileMetaData::encryption_algorithm() const { + return impl_->encryption_algorithm(); +} + +const std::string& FileMetaData::footer_signing_key_metadata() const { + return impl_->footer_signing_key_metadata(); +} + ParquetVersion::type FileMetaData::version() const { switch (impl_->version()) { case 1: @@ -1062,7 +1113,8 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { return current_row_group_builder_.get(); } - std::unique_ptr Finish() { + std::unique_ptr Finish(const EncryptionAlgorithm* signing_algorithm, + const std::string& footer_signing_key_metadata) { int64_t total_rows = 0; for (auto row_group : row_groups_) { total_rows += row_group.num_rows; @@ -1108,6 +1160,13 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { metadata_->column_orders.resize(schema_->num_columns(), column_order); metadata_->__isset.column_orders = true; + if (signing_algorithm != NULLPTR) { + metadata_->__set_encryption_algorithm(ToThrift(*signing_algorithm)); + if (footer_signing_key_metadata.size() > 0) { + 
metadata_->__set_footer_signing_key_metadata(footer_signing_key_metadata); + } + } + parquet::schema::SchemaFlattener flattener( static_cast(schema_->schema_root().get()), &metadata_->schema); @@ -1176,7 +1235,11 @@ RowGroupMetaDataBuilder* FileMetaDataBuilder::AppendRowGroup() { return impl_->AppendRowGroup(); } -std::unique_ptr FileMetaDataBuilder::Finish() { return impl_->Finish(); } +std::unique_ptr FileMetaDataBuilder::Finish( + const EncryptionAlgorithm* signing_algorithm, + const std::string& footer_signing_key_metadata) { + return impl_->Finish(signing_algorithm, footer_signing_key_metadata); +} std::unique_ptr FileMetaDataBuilder::GetCryptoMetaData() { return impl_->BuildFileCryptoMetaData(); diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index d1f1f936a6ee..1e7b67637e10 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -189,6 +189,8 @@ class PARQUET_EXPORT FileMetaData { ~FileMetaData(); + bool verify(std::shared_ptr encryption, + const void* tail, uint32_t tail_len); // file metadata uint32_t size() const; @@ -197,6 +199,9 @@ class PARQUET_EXPORT FileMetaData { int64_t num_rows() const; int num_row_groups() const; + bool is_plaintext_mode() const; + EncryptionAlgorithm encryption_algorithm() const; + const std::string& footer_signing_key_metadata() const; ParquetVersion::type version() const; const std::string& created_by() const; int num_schema_elements() const; @@ -332,7 +337,8 @@ class PARQUET_EXPORT FileMetaDataBuilder { RowGroupMetaDataBuilder* AppendRowGroup(); // Complete the Thrift structure - std::unique_ptr Finish(); + std::unique_ptr Finish(const EncryptionAlgorithm* signing_algorithm = NULLPTR, + const std::string& footer_signing_key_metadata = ""); // crypto metadata std::unique_ptr GetCryptoMetaData(); diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 624ab65bf9c1..51ed22f123d4 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -283,9 
+283,10 @@ class PARQUET_EXPORT FileEncryptionProperties { return this; } - Builder* footer_key(const std::string& key) { + Builder* footer_key(const std::string& key, bool encrypt_footer=true) { DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); footer_key_ = key; + encrypt_footer_ = encrypt_footer; return this; } @@ -357,13 +358,14 @@ class PARQUET_EXPORT FileEncryptionProperties { footer_encryption.reset(new EncryptionProperties(algorithm_, footer_key_, aad_)); } return std::make_shared( - footer_encryption, footer_key_metadata_, aad_metadata_, + footer_encryption, encrypt_footer_, footer_key_metadata_, aad_metadata_, column_properties_, encrypt_the_rest_); } private: Encryption::type algorithm_; std::string footer_key_; + bool encrypt_footer_; std::string footer_key_metadata_; std::string aad_; @@ -374,12 +376,13 @@ class PARQUET_EXPORT FileEncryptionProperties { }; FileEncryptionProperties( - const std::shared_ptr& footer_encryption, + const std::shared_ptr& footer_encryption, bool encrypt_footer, const std::string& footer_key_metadata, const std::string& aad_metadata, const std::map>& column_properties, bool encrypt_the_rest) : footer_encryption_(footer_encryption), + encrypt_footer_(encrypt_footer), footer_key_metadata_(footer_key_metadata), aad_metadata_(aad_metadata), column_properties_(column_properties), @@ -389,6 +392,8 @@ class PARQUET_EXPORT FileEncryptionProperties { return footer_encryption_; } + bool encrypt_footer() const { return encrypt_footer_; } + const std::string& footer_key_metadata() const { return footer_key_metadata_; } const std::string& aad_metadata() const { return aad_metadata_; } @@ -431,6 +436,7 @@ class PARQUET_EXPORT FileEncryptionProperties { private: std::shared_ptr footer_encryption_; + bool encrypt_footer_; std::string footer_key_metadata_; std::string aad_metadata_; diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 53f9b84c3a6e..a1d6493a6c6a 100644 --- a/cpp/src/parquet/thrift.h +++ 
b/cpp/src/parquet/thrift.h @@ -259,9 +259,28 @@ class ThriftSerializer { /// memory returned is owned by this object and will be invalid when another object /// is serialized. template - void SerializeToBuffer(const T* obj, uint32_t* len, uint8_t** buffer) { + void SerializeToBuffer(const T* obj, uint32_t* len, uint8_t** buffer, + const std::shared_ptr& encryption = NULLPTR) { SerializeObject(obj); - mem_buffer_->getBuffer(buffer, len); + if (encryption == NULLPTR) { + mem_buffer_->getBuffer(buffer, len); + } + else { + uint8_t* out_buffer; + uint32_t out_length; + mem_buffer_->getBuffer(&out_buffer, &out_length); + // encrypt + std::vector cipher_buffer(encryption->CalculateCipherSize(out_length)); + int cipher_buffer_len = parquet_encryption::Encrypt( + encryption, true, out_buffer, out_length, cipher_buffer.data()); + if (cipher_buffer_len > cipher_buffer.size()) { + std::stringstream ss; + ss << "cipher length is greater than cipher buffer capacity: " << cipher_buffer_len << cipher_buffer.size() << "\n"; + throw ParquetException(ss.str()); + } + *len = cipher_buffer_len; + *buffer = cipher_buffer.data(); + } } template From 6a2a918eadd48e120c66495bf10c32d599f36a38 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Mon, 4 Mar 2019 12:56:49 +0700 Subject: [PATCH 011/201] footer plaintext mode example --- .../parquet/low-level-api/encryption-reader-writer.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc index b762f46ddfb1..c26d9eee2dde 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc @@ -74,11 +74,20 @@ int main(int argc, char** argv) { std::make_shared(FOOTER_ENCRYPTION_KEY); decryption_properties_2->SetColumnKey("ba_field", COLUMN_ENCRYPTION_KEY); + // plain mode footer = unencrypted footer + 
parquet::FileEncryptionProperties::Builder file_encryption_builder_3; + file_encryption_builder_3.footer_key(FOOTER_ENCRYPTION_KEY, false); + + std::shared_ptr decryption_properties_3 = + std::make_shared(FOOTER_ENCRYPTION_KEY); + file_encryption_properties.push_back(file_encryption_builder_1.build()); file_encryption_properties.push_back(file_encryption_builder_2.build()); + file_encryption_properties.push_back(file_encryption_builder_3.build()); file_decryption_properties.push_back(decryption_properties_1); file_decryption_properties.push_back(decryption_properties_2); + file_decryption_properties.push_back(decryption_properties_3); for (int i = 0; i < file_encryption_properties.size(); ++i) { /********************************************************************************** From e428fdae7aeb4725151d2663e34a81f1a8905a5e Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Thu, 7 Mar 2019 14:19:29 +0700 Subject: [PATCH 012/201] fix compiling issue --- .../parquet/low-level-api/encryption-reader-writer.cc | 2 +- cpp/src/parquet/file_reader.cc | 2 +- cpp/src/parquet/metadata.cc | 9 +++++---- cpp/src/parquet/thrift.h | 4 ++-- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc index c26d9eee2dde..76ba264780f4 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc @@ -89,7 +89,7 @@ int main(int argc, char** argv) { file_decryption_properties.push_back(decryption_properties_2); file_decryption_properties.push_back(decryption_properties_3); - for (int i = 0; i < file_encryption_properties.size(); ++i) { + for (unsigned i = 0; i < file_encryption_properties.size(); ++i) { /********************************************************************************** PARQUET WRITER EXAMPLE 
**********************************************************************************/ diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 08cef61e3bbd..20614d3c19de 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -272,7 +272,7 @@ class SerializedFile : public ParquetFileReader::Contents { } } - uint32_t read_metadata_len; + uint32_t read_metadata_len = metadata_len; file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &read_metadata_len); if (file_metadata_->is_plaintext_mode()) { diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 69a71f3fdc29..fcac5ecd5cf6 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -427,12 +427,12 @@ class FileMetaData::FileMetaDataImpl { const void* tail, uint32_t tail_len) { // re-encrypt the footer uint8_t* encrypted_file_metadata; - uint32_t encrypted_file_metadata_len; + uint32_t encrypted_file_metadata_len = metadata_len_; ThriftSerializer serializer; serializer.SerializeToBuffer(metadata_.get(), &encrypted_file_metadata_len, &encrypted_file_metadata, encryption); - // compare - if (0 != memcmp(encrypted_file_metadata + encrypted_file_metadata_len - tail_len, + // compare (not count 4 bytes at the end for length) + if (0 != memcmp(encrypted_file_metadata + encrypted_file_metadata_len - tail_len - 4, reinterpret_cast(tail), tail_len)) { return false; } @@ -471,7 +471,8 @@ class FileMetaData::FileMetaDataImpl { serializer.SerializeToBuffer(metadata_.get(), &encrypted_file_metadata_len, &encrypted_file_metadata, encryption); // 2. 
write 28 bytes of nonce_and_tag (at the end of encrypted file metadata) - dst->Write(encrypted_file_metadata+encrypted_file_metadata_len-28, 28); + // (not count 4 bytes at the end for length) + dst->Write(encrypted_file_metadata+encrypted_file_metadata_len-32, 28); } else { serializer.Serialize(metadata_.get(), dst, encryption, false); diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index a1d6493a6c6a..01d163b52a2e 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -271,7 +271,7 @@ class ThriftSerializer { mem_buffer_->getBuffer(&out_buffer, &out_length); // encrypt std::vector cipher_buffer(encryption->CalculateCipherSize(out_length)); - int cipher_buffer_len = parquet_encryption::Encrypt( + unsigned cipher_buffer_len = parquet_encryption::Encrypt( encryption, true, out_buffer, out_length, cipher_buffer.data()); if (cipher_buffer_len > cipher_buffer.size()) { std::stringstream ss; @@ -302,7 +302,7 @@ class ThriftSerializer { return static_cast(out_length); } else { std::vector cipher_buffer(encryption->CalculateCipherSize(out_length)); - int cipher_buffer_len = parquet_encryption::Encrypt( + unsigned cipher_buffer_len = parquet_encryption::Encrypt( encryption, true, out_buffer, out_length, cipher_buffer.data()); if (cipher_buffer_len > cipher_buffer.size()) { std::stringstream ss; From 537b5e00325bde1a27fea0a73adfd034464c5fec Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Fri, 8 Mar 2019 19:00:31 +0700 Subject: [PATCH 013/201] fix plaintext mode verification --- cpp/src/parquet/metadata.cc | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index fcac5ecd5cf6..8fa4e3652200 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -425,15 +425,23 @@ class FileMetaData::FileMetaDataImpl { bool verify(std::shared_ptr encryption, const void* tail, uint32_t tail_len) { - // re-encrypt the footer - uint8_t* 
encrypted_file_metadata; - uint32_t encrypted_file_metadata_len = metadata_len_; + // serialize the footer + uint8_t* serialized_data; + uint32_t serialized_len = metadata_len_; ThriftSerializer serializer; - serializer.SerializeToBuffer(metadata_.get(), &encrypted_file_metadata_len, - &encrypted_file_metadata, encryption); - // compare (not count 4 bytes at the end for length) - if (0 != memcmp(encrypted_file_metadata + encrypted_file_metadata_len - tail_len - 4, - reinterpret_cast(tail), tail_len)) { + serializer.SerializeToBuffer(metadata_.get(), &serialized_len, &serialized_data); + + // encrypt with nonce + uint8_t* nonce = const_cast(reinterpret_cast(tail)); + uint8_t* tag = const_cast(reinterpret_cast(tail)) + 12; + + std::vector encrypted_buffer(encryption->CalculateCipherSize(serialized_len)); + uint32_t encrypted_len = parquet_encryption::SignedFooterEncrypt( + serialized_data, serialized_len, encryption->key_bytes(), + encryption->key_length(), encryption->aad_bytes(), encryption->aad_length(), + nonce, 12, encrypted_buffer.data()); + + if (0 != memcmp(encrypted_buffer.data() + encrypted_len - 16, tag, 16)) { return false; } return true; @@ -470,9 +478,10 @@ class FileMetaData::FileMetaDataImpl { ThriftSerializer serializer; serializer.SerializeToBuffer(metadata_.get(), &encrypted_file_metadata_len, &encrypted_file_metadata, encryption); - // 2. 
write 28 bytes of nonce_and_tag (at the end of encrypted file metadata) - // (not count 4 bytes at the end for length) - dst->Write(encrypted_file_metadata+encrypted_file_metadata_len-32, 28); + // write nonce + dst->Write(encrypted_file_metadata + 4, 12); + // write tag + dst->Write(encrypted_file_metadata + encrypted_file_metadata_len - 16, 16); } else { serializer.Serialize(metadata_.get(), dst, encryption, false); From 5d495c33a49325ed501fb20e8b7bab752c0e9b4e Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Tue, 12 Mar 2019 14:32:27 +0700 Subject: [PATCH 014/201] fix memory issue when serializing plaintext mode footer --- cpp/src/parquet/metadata.cc | 28 +++++++++++++++------------- cpp/src/parquet/thrift.h | 23 ++--------------------- 2 files changed, 17 insertions(+), 34 deletions(-) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 8fa4e3652200..43b4cf869422 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -32,6 +32,7 @@ // ARROW-6096: The boost regex library must be used when compiling with gcc < 4.9 #if defined(PARQUET_USE_BOOST_REGEX) +#include #include // IWYU pragma: keep using ::boost::regex; using ::boost::regex_match; @@ -441,10 +442,7 @@ class FileMetaData::FileMetaDataImpl { encryption->key_length(), encryption->aad_bytes(), encryption->aad_length(), nonce, 12, encrypted_buffer.data()); - if (0 != memcmp(encrypted_buffer.data() + encrypted_len - 16, tag, 16)) { - return false; - } - return true; + return 0 == memcmp(encrypted_buffer.data() + encrypted_len - 16, tag, 16); } inline uint32_t size() const { return metadata_len_; } @@ -471,17 +469,21 @@ class FileMetaData::FileMetaDataImpl { void WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryption) const { ThriftSerializer serializer; if (is_plaintext_mode()) { - serializer.Serialize(metadata_.get(), dst); - // 1. 
encrypt the footer key - uint8_t* encrypted_file_metadata; - uint32_t encrypted_file_metadata_len; - ThriftSerializer serializer; - serializer.SerializeToBuffer(metadata_.get(), &encrypted_file_metadata_len, - &encrypted_file_metadata, encryption); + uint8_t* serialized_data; + uint32_t serialized_len; + serializer.SerializeToBuffer(metadata_.get(), &serialized_len, &serialized_data); + + // encrypt the footer key + std::vector encrypted_data(encryption->CalculateCipherSize(serialized_len)); + unsigned encrypted_len = parquet_encryption::Encrypt( + encryption, true, serialized_data, serialized_len, encrypted_data.data()); + + // write unencrypted footer + dst->Write(serialized_data, serialized_len); // write nonce - dst->Write(encrypted_file_metadata + 4, 12); + dst->Write(encrypted_data.data() + 4, 12); // write tag - dst->Write(encrypted_file_metadata + encrypted_file_metadata_len - 16, 16); + dst->Write(encrypted_data.data() + encrypted_len - 16, 16); } else { serializer.Serialize(metadata_.get(), dst, encryption, false); diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 01d163b52a2e..8eb872abbb09 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -259,28 +259,9 @@ class ThriftSerializer { /// memory returned is owned by this object and will be invalid when another object /// is serialized. 
template - void SerializeToBuffer(const T* obj, uint32_t* len, uint8_t** buffer, - const std::shared_ptr& encryption = NULLPTR) { + void SerializeToBuffer(const T* obj, uint32_t* len, uint8_t** buffer) { SerializeObject(obj); - if (encryption == NULLPTR) { - mem_buffer_->getBuffer(buffer, len); - } - else { - uint8_t* out_buffer; - uint32_t out_length; - mem_buffer_->getBuffer(&out_buffer, &out_length); - // encrypt - std::vector cipher_buffer(encryption->CalculateCipherSize(out_length)); - unsigned cipher_buffer_len = parquet_encryption::Encrypt( - encryption, true, out_buffer, out_length, cipher_buffer.data()); - if (cipher_buffer_len > cipher_buffer.size()) { - std::stringstream ss; - ss << "cipher length is greater than cipher buffer capacity: " << cipher_buffer_len << cipher_buffer.size() << "\n"; - throw ParquetException(ss.str()); - } - *len = cipher_buffer_len; - *buffer = cipher_buffer.data(); - } + mem_buffer_->getBuffer(buffer, len); } template From e1e9470d4fac086c641524789c2eb947294f4da3 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Thu, 14 Mar 2019 18:53:06 +0700 Subject: [PATCH 015/201] protection of sensitive metadata --- cpp/src/parquet/file_reader.cc | 2 +- cpp/src/parquet/metadata.cc | 96 ++++++++++++++++++++++++---------- cpp/src/parquet/metadata.h | 8 +-- 3 files changed, 75 insertions(+), 31 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 20614d3c19de..b5e94f33d605 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -98,7 +98,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::unique_ptr GetColumnPageReader(int i) override { // Read column chunk from the file - auto col = row_group_metadata_->ColumnChunk(i); + auto col = row_group_metadata_->ColumnChunk(i, properties_.file_decryption()); int64_t col_start = col->data_page_offset(); if (col->has_dictionary_page() && col->dictionary_page_offset() > 0 && diff --git a/cpp/src/parquet/metadata.cc 
b/cpp/src/parquet/metadata.cc index 43b4cf869422..bfca0642ef20 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -174,10 +174,38 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { public: explicit ColumnChunkMetaDataImpl(const format::ColumnChunk* column, const ColumnDescriptor* descr, - const ApplicationVersion* writer_version) + const ApplicationVersion* writer_version, + FileDecryptionProperties* file_decryption = NULLPTR) : column_(column), descr_(descr), writer_version_(writer_version) { - const format::ColumnMetaData& meta_data = column->meta_data; - for (auto encoding : meta_data.encodings) { + + metadata_ = column->meta_data; + + if (column->__isset.crypto_metadata) { + format::ColumnCryptoMetaData ccmd = column->crypto_metadata; + + if (ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY) { + if (file_decryption == NULLPTR) { + throw ParquetException("Cannot decrypt ColumnMetadata. FileDecryptionProperties must be provided."); + } + // should decrypt metadata + std::shared_ptr path = std::make_shared( + ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema); + std::string key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata; + const std::string& key = file_decryption->GetColumnKey(path, key_metadata); + if (key.empty()) { + throw ParquetException("Cannot decrypt ColumnMetadata. Column encryption key must be provided."); + } + + // TODO: get algorithm from FileCryptoMetadata??? 
+ auto encryption = std::make_shared(Encryption::AES_GCM_V1, key); + + uint32_t len = static_cast(column->encrypted_column_metadata.size()); + DeserializeThriftMsg(reinterpret_cast(column->encrypted_column_metadata.c_str()), + &len, &metadata_, encryption, false); + } + } + + for (auto encoding : metadata_.encodings) { encodings_.push_back(FromThrift(encoding)); } possible_stats_ = nullptr; @@ -188,12 +216,12 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { inline const std::string& file_path() const { return column_->file_path; } // column metadata - inline Type::type type() const { return FromThrift(column_->meta_data.type); } + inline Type::type type() const { return FromThrift(metadata_.type); } - inline int64_t num_values() const { return column_->meta_data.num_values; } + inline int64_t num_values() const { return metadata_.num_values; } std::shared_ptr path_in_schema() { - return std::make_shared(column_->meta_data.path_in_schema); + return std::make_shared(metadata_.path_in_schema); } // Check if statistics are set and are valid @@ -203,12 +231,12 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { DCHECK(writer_version_ != nullptr); // If the column statistics don't exist or column sort order is unknown // we cannot use the column stats - if (!column_->meta_data.__isset.statistics || + if (!metadata_.__isset.statistics || descr_->sort_order() == SortOrder::UNKNOWN) { return false; } if (possible_stats_ == nullptr) { - possible_stats_ = MakeColumnStats(column_->meta_data, descr_); + possible_stats_ = MakeColumnStats(metadata_, descr_); } EncodedStatistics encodedStatistics = possible_stats_->Encode(); return writer_version_->HasCorrectStatistics(type(), encodedStatistics, @@ -220,35 +248,35 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { } inline Compression::type compression() const { - return FromThrift(column_->meta_data.codec); + return FromThrift(metadata_.codec); } const std::vector& encodings() const { return encodings_; } inline 
bool has_dictionary_page() const { - return column_->meta_data.__isset.dictionary_page_offset; + return metadata_.__isset.dictionary_page_offset; } inline int64_t dictionary_page_offset() const { - return column_->meta_data.dictionary_page_offset; + return metadata_.dictionary_page_offset; } - inline int64_t data_page_offset() const { return column_->meta_data.data_page_offset; } + inline int64_t data_page_offset() const { return metadata_.data_page_offset; } inline bool has_index_page() const { - return column_->meta_data.__isset.index_page_offset; + return metadata_.__isset.index_page_offset; } inline int64_t index_page_offset() const { - return column_->meta_data.index_page_offset; + return metadata_.index_page_offset; } inline int64_t total_compressed_size() const { - return column_->meta_data.total_compressed_size; + return metadata_.total_compressed_size; } inline int64_t total_uncompressed_size() const { - return column_->meta_data.total_uncompressed_size; + return metadata_.total_uncompressed_size; } inline std::unique_ptr crypto_metadata() const { @@ -264,23 +292,26 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { mutable std::shared_ptr possible_stats_; std::vector encodings_; const format::ColumnChunk* column_; + format::ColumnMetaData metadata_; const ColumnDescriptor* descr_; const ApplicationVersion* writer_version_; }; std::unique_ptr ColumnChunkMetaData::Make( const void* metadata, const ColumnDescriptor* descr, - const ApplicationVersion* writer_version) { + const ApplicationVersion* writer_version, + FileDecryptionProperties* file_decryption) { return std::unique_ptr( - new ColumnChunkMetaData(metadata, descr, writer_version)); + new ColumnChunkMetaData(metadata, descr, writer_version, file_decryption)); } ColumnChunkMetaData::ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, - const ApplicationVersion* writer_version) + const ApplicationVersion* writer_version, + FileDecryptionProperties* file_decryption) : 
impl_{std::unique_ptr(new ColumnChunkMetaDataImpl( reinterpret_cast(metadata), descr, - writer_version))} {} + writer_version, file_decryption))} {} ColumnChunkMetaData::~ColumnChunkMetaData() {} // column chunk @@ -357,7 +388,7 @@ class RowGroupMetaData::RowGroupMetaDataImpl { inline const SchemaDescriptor* schema() const { return schema_; } - std::unique_ptr ColumnChunk(int i) { + std::unique_ptr ColumnChunk(int i, FileDecryptionProperties* file_decryption = NULLPTR) { if (!(i < num_columns())) { std::stringstream ss; ss << "The file only has " << num_columns() @@ -365,7 +396,7 @@ class RowGroupMetaData::RowGroupMetaDataImpl { throw ParquetException(ss.str()); } return ColumnChunkMetaData::Make(&row_group_->columns[i], schema_->Column(i), - writer_version_); + writer_version_, file_decryption); } private: @@ -396,8 +427,8 @@ int64_t RowGroupMetaData::total_byte_size() const { return impl_->total_byte_siz const SchemaDescriptor* RowGroupMetaData::schema() const { return impl_->schema(); } -std::unique_ptr RowGroupMetaData::ColumnChunk(int i) const { - return impl_->ColumnChunk(i); +std::unique_ptr RowGroupMetaData::ColumnChunk(int i, FileDecryptionProperties* file_decryption) const { + return impl_->ColumnChunk(i, file_decryption); } // file metadata @@ -913,12 +944,22 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { column_chunk_->__isset.meta_data = false; // Thrift-serialize the ColumnMetaData structure, - // encrypt it with the column key, and write the result to the output stream - // (first length, then buffer) + // encrypt it with the column key, and write to encrypted_column_metadata auto encrypt_props = properties_->encryption(column_->path()); uint64_t metadata_start = sink->Tell(); - serializer.Serialize(&column_metadata_, sink, encrypt_props); + uint8_t* serialized_data; + uint32_t serialized_len; + serializer.SerializeToBuffer(&column_metadata_, &serialized_len, &serialized_data); + + // encrypt the footer key + std::vector 
encrypted_data(encrypt_props->CalculateCipherSize(serialized_len)); + unsigned encrypted_len = parquet_encryption::Encrypt( + encrypt_props, true, serialized_data, serialized_len, encrypted_data.data()); + // TODO + const char* temp = const_cast(reinterpret_cast(encrypted_data.data())); + std::string encrypted_column_metadata(temp, encrypted_len); + column_chunk_->__set_encrypted_column_metadata(encrypted_column_metadata); // Set the ColumnMetaData offset at the “file_offset” field in the ColumnChunk. column_chunk_->__set_file_offset(metadata_start); @@ -936,6 +977,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { private: void Init(format::ColumnChunk* column_chunk) { column_chunk_ = column_chunk; + column_metadata_ = column_chunk_->meta_data; column_metadata_.__set_type(ToThrift(column_->physical_type())); column_metadata_.__set_path_in_schema(column_->path()->ToDotVector()); diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 1e7b67637e10..dc4b37c7043c 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -118,7 +118,8 @@ class PARQUET_EXPORT ColumnChunkMetaData { // API convenience to get a MetaData accessor static std::unique_ptr Make( const void* metadata, const ColumnDescriptor* descr, - const ApplicationVersion* writer_version = NULLPTR); + const ApplicationVersion* writer_version = NULLPTR, + FileDecryptionProperties* file_decryption = NULLPTR); ~ColumnChunkMetaData(); @@ -147,7 +148,8 @@ class PARQUET_EXPORT ColumnChunkMetaData { private: explicit ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, - const ApplicationVersion* writer_version = NULLPTR); + const ApplicationVersion* writer_version = NULLPTR, + FileDecryptionProperties* file_decryption = NULLPTR); // PIMPL Idiom class ColumnChunkMetaDataImpl; std::unique_ptr impl_; @@ -168,7 +170,7 @@ class PARQUET_EXPORT RowGroupMetaData { int64_t total_byte_size() const; // Return const-pointer to make it clear that this 
object is not to be copied const SchemaDescriptor* schema() const; - std::unique_ptr ColumnChunk(int i) const; + std::unique_ptr ColumnChunk(int i, FileDecryptionProperties* file_decryption = NULLPTR) const; private: explicit RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema, From 9040fbc3117ebbc6d24963e30973ef741dab1725 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Mon, 18 Mar 2019 13:16:41 +0700 Subject: [PATCH 016/201] fix duplication of variable i --- .../parquet/low-level-api/encryption-reader-writer.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc index 76ba264780f4..66c2d22f584e 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc @@ -89,7 +89,7 @@ int main(int argc, char** argv) { file_decryption_properties.push_back(decryption_properties_2); file_decryption_properties.push_back(decryption_properties_3); - for (unsigned i = 0; i < file_encryption_properties.size(); ++i) { + for (unsigned example_id = 0; example_id < file_encryption_properties.size(); ++example_id) { /********************************************************************************** PARQUET WRITER EXAMPLE **********************************************************************************/ @@ -110,7 +110,7 @@ int main(int argc, char** argv) { // Add writer properties parquet::WriterProperties::Builder builder; builder.compression(parquet::Compression::SNAPPY); - builder.encryption(file_encryption_properties[i]); + builder.encryption(file_encryption_properties[example_id]); std::shared_ptr props = builder.build(); @@ -226,7 +226,7 @@ int main(int argc, char** argv) { try { parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); - reader_properties.file_decryption(file_decryption_properties[i]); + 
reader_properties.file_decryption(file_decryption_properties[example_id]); // Create a ParquetReader instance std::unique_ptr parquet_reader = @@ -453,7 +453,7 @@ int main(int argc, char** argv) { std::cerr << "Parquet read error: " << e.what() << std::endl; } - std::cout << "Example [" << (i+1) << "] Parquet Writing and Reading Complete" << std::endl; + std::cout << "Example [" << (example_id+1) << "] Parquet Writing and Reading Complete" << std::endl; } return 0; } \ No newline at end of file From b177e8c019868a49eb5d89df6c63eb3b31e8289f Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Tue, 19 Mar 2019 16:25:13 +0700 Subject: [PATCH 017/201] column metadata encryption: read algorithm, aad from FileCryptoMetadata or FileMetadata --- cpp/src/parquet/file_reader.cc | 10 +++++++++- cpp/src/parquet/metadata.cc | 31 ++++++++++++++++++------------- cpp/src/parquet/metadata.h | 11 ++++++++--- 3 files changed, 35 insertions(+), 17 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index b5e94f33d605..663c734981f4 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -97,8 +97,16 @@ class SerializedRowGroup : public RowGroupReader::Contents { const ReaderProperties* properties() const override { return &properties_; } std::unique_ptr GetColumnPageReader(int i) override { + EncryptionAlgorithm algorithm; + if (file_crypto_metadata_) { + algorithm = file_crypto_metadata_->encryption_algorithm(); + } + else if (file_metadata_->is_plaintext_mode()) { + algorithm = file_metadata_->encryption_algorithm(); + } // Read column chunk from the file - auto col = row_group_metadata_->ColumnChunk(i, properties_.file_decryption()); + auto col = row_group_metadata_->ColumnChunk(i, properties_.file_decryption(), + &algorithm); int64_t col_start = col->data_page_offset(); if (col->has_dictionary_page() && col->dictionary_page_offset() > 0 && diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 
bfca0642ef20..e58788f2dc34 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -175,7 +175,8 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { explicit ColumnChunkMetaDataImpl(const format::ColumnChunk* column, const ColumnDescriptor* descr, const ApplicationVersion* writer_version, - FileDecryptionProperties* file_decryption = NULLPTR) + FileDecryptionProperties* file_decryption = NULLPTR, + const EncryptionAlgorithm* algorithm = NULLPTR) : column_(column), descr_(descr), writer_version_(writer_version) { metadata_ = column->meta_data; @@ -196,8 +197,9 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { throw ParquetException("Cannot decrypt ColumnMetadata. Column encryption key must be provided."); } - // TODO: get algorithm from FileCryptoMetadata??? - auto encryption = std::make_shared(Encryption::AES_GCM_V1, key); + DCHECK(algorithm != NULLPTR); + // TODO: AAD + auto encryption = std::make_shared(algorithm->algorithm, key); uint32_t len = static_cast(column->encrypted_column_metadata.size()); DeserializeThriftMsg(reinterpret_cast(column->encrypted_column_metadata.c_str()), @@ -300,18 +302,20 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { std::unique_ptr ColumnChunkMetaData::Make( const void* metadata, const ColumnDescriptor* descr, const ApplicationVersion* writer_version, - FileDecryptionProperties* file_decryption) { + FileDecryptionProperties* file_decryption, + const EncryptionAlgorithm* algorithm) { return std::unique_ptr( - new ColumnChunkMetaData(metadata, descr, writer_version, file_decryption)); + new ColumnChunkMetaData(metadata, descr, writer_version, file_decryption, algorithm)); } ColumnChunkMetaData::ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, const ApplicationVersion* writer_version, - FileDecryptionProperties* file_decryption) + FileDecryptionProperties* file_decryption, + const EncryptionAlgorithm* algorithm) : impl_{std::unique_ptr(new ColumnChunkMetaDataImpl( 
reinterpret_cast(metadata), descr, - writer_version, file_decryption))} {} + writer_version, file_decryption, algorithm))} {} ColumnChunkMetaData::~ColumnChunkMetaData() {} // column chunk @@ -388,7 +392,8 @@ class RowGroupMetaData::RowGroupMetaDataImpl { inline const SchemaDescriptor* schema() const { return schema_; } - std::unique_ptr ColumnChunk(int i, FileDecryptionProperties* file_decryption = NULLPTR) { + std::unique_ptr ColumnChunk(int i, FileDecryptionProperties* file_decryption = NULLPTR, + const EncryptionAlgorithm* algorithm = NULLPTR) { if (!(i < num_columns())) { std::stringstream ss; ss << "The file only has " << num_columns() @@ -396,7 +401,7 @@ class RowGroupMetaData::RowGroupMetaDataImpl { throw ParquetException(ss.str()); } return ColumnChunkMetaData::Make(&row_group_->columns[i], schema_->Column(i), - writer_version_, file_decryption); + writer_version_, file_decryption, algorithm); } private: @@ -427,8 +432,9 @@ int64_t RowGroupMetaData::total_byte_size() const { return impl_->total_byte_siz const SchemaDescriptor* RowGroupMetaData::schema() const { return impl_->schema(); } -std::unique_ptr RowGroupMetaData::ColumnChunk(int i, FileDecryptionProperties* file_decryption) const { - return impl_->ColumnChunk(i, file_decryption); +std::unique_ptr RowGroupMetaData::ColumnChunk(int i, FileDecryptionProperties* file_decryption, + const EncryptionAlgorithm* algorithm) const { + return impl_->ColumnChunk(i, file_decryption, algorithm); } // file metadata @@ -952,11 +958,10 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { uint32_t serialized_len; serializer.SerializeToBuffer(&column_metadata_, &serialized_len, &serialized_data); - // encrypt the footer key std::vector encrypted_data(encrypt_props->CalculateCipherSize(serialized_len)); unsigned encrypted_len = parquet_encryption::Encrypt( encrypt_props, true, serialized_data, serialized_len, encrypted_data.data()); - // TODO + const char* temp = 
const_cast(reinterpret_cast(encrypted_data.data())); std::string encrypted_column_metadata(temp, encrypted_len); column_chunk_->__set_encrypted_column_metadata(encrypted_column_metadata); diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index dc4b37c7043c..528680e821cb 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -113,13 +113,16 @@ class PARQUET_EXPORT ColumnCryptoMetaData { std::unique_ptr impl_; }; +class FileCryptoMetaData; + class PARQUET_EXPORT ColumnChunkMetaData { public: // API convenience to get a MetaData accessor static std::unique_ptr Make( const void* metadata, const ColumnDescriptor* descr, const ApplicationVersion* writer_version = NULLPTR, - FileDecryptionProperties* file_decryption = NULLPTR); + FileDecryptionProperties* file_decryption = NULLPTR, + const EncryptionAlgorithm* algorithm = NULLPTR); ~ColumnChunkMetaData(); @@ -149,7 +152,8 @@ class PARQUET_EXPORT ColumnChunkMetaData { private: explicit ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, const ApplicationVersion* writer_version = NULLPTR, - FileDecryptionProperties* file_decryption = NULLPTR); + FileDecryptionProperties* file_decryption = NULLPTR, + const EncryptionAlgorithm* algorithm = NULLPTR); // PIMPL Idiom class ColumnChunkMetaDataImpl; std::unique_ptr impl_; @@ -170,7 +174,8 @@ class PARQUET_EXPORT RowGroupMetaData { int64_t total_byte_size() const; // Return const-pointer to make it clear that this object is not to be copied const SchemaDescriptor* schema() const; - std::unique_ptr ColumnChunk(int i, FileDecryptionProperties* file_decryption = NULLPTR) const; + std::unique_ptr ColumnChunk(int i, FileDecryptionProperties* file_decryption = NULLPTR, + const EncryptionAlgorithm* algorithm = NULLPTR) const; private: explicit RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema, From 921830c9213f17821b8a4882d9447682f182c4ed Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Wed, 20 Mar 2019 
19:28:46 +0700 Subject: [PATCH 018/201] keep redacted metadata version for old readers --- cpp/src/parquet/metadata.cc | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index e58788f2dc34..bcdd221be547 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -946,13 +946,9 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { // key if ((footer_encryption == nullptr && encrypt_md->encrypted()) || !encrypt_md->encrypted_with_footer_key()) { - // don't set meta_data - column_chunk_->__isset.meta_data = false; - // Thrift-serialize the ColumnMetaData structure, // encrypt it with the column key, and write to encrypted_column_metadata auto encrypt_props = properties_->encryption(column_->path()); - uint64_t metadata_start = sink->Tell(); uint8_t* serialized_data; uint32_t serialized_len; @@ -966,8 +962,22 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { std::string encrypted_column_metadata(temp, encrypted_len); column_chunk_->__set_encrypted_column_metadata(encrypted_column_metadata); - // Set the ColumnMetaData offset at the “file_offset” field in the ColumnChunk. 
- column_chunk_->__set_file_offset(metadata_start); + // Keep redacted metadata version for old readers + if (footer_encryption == nullptr) { + format::ColumnMetaData metadata_redacted = column_metadata_; + if (metadata_redacted.__isset.statistics) { + metadata_redacted.__isset.statistics = false; + } + if (metadata_redacted.__isset.encoding_stats) { + metadata_redacted.__isset.encoding_stats = false; + } + column_chunk_->__isset.meta_data = true; + column_chunk_->__set_meta_data(metadata_redacted); + } + else { + // don't set meta_data + column_chunk_->__isset.meta_data = true; + } } else { column_chunk_->__isset.meta_data = true; column_chunk_->__set_meta_data(column_metadata_); From 4f727f22ac1f3486823d0a5edbc97ee1e4d99a9a Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Wed, 20 Mar 2019 20:08:22 +0700 Subject: [PATCH 019/201] hidden column exception --- cpp/src/parquet/encryption.h | 10 +++++++++- cpp/src/parquet/metadata.cc | 8 ++++---- cpp/src/parquet/properties.h | 12 ++++++++++++ 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 1dbf0d20f39e..d4d30076cd0a 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -18,10 +18,12 @@ #ifndef PARQUET_ENCRYPTION_H #define PARQUET_ENCRYPTION_H -#include #include #include +#include +#include "parquet/exception.h" + namespace parquet { class PARQUET_EXPORT DecryptionKeyRetriever { @@ -50,6 +52,12 @@ class PARQUET_EXPORT StringKeyIdRetriever : public DecryptionKeyRetriever { std::map key_map_; }; +class PARQUET_EXPORT HiddenColumnException : public ParquetException { + public: + HiddenColumnException(const std::string columnPath) + : ParquetException(columnPath.c_str()) {} +}; + } // namespace parquet #endif // PARQUET_ENCRYPTION_H diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index bcdd221be547..ccd5b13918c5 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -191,11 
+191,11 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { // should decrypt metadata std::shared_ptr path = std::make_shared( ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema); - std::string key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata; - const std::string& key = file_decryption->GetColumnKey(path, key_metadata); - if (key.empty()) { - throw ParquetException("Cannot decrypt ColumnMetadata. Column encryption key must be provided."); + std::string key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata; std::cout << 1 << std::endl; + if (!file_decryption->HasColumnKey(path, key_metadata)) { + throw HiddenColumnException(path->ToDotString()); } + const std::string& key = file_decryption->GetColumnKey(path, key_metadata); std::cout << 2 << std::endl; DCHECK(algorithm != NULLPTR); // TODO: AAD diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 51ed22f123d4..350141cb2cb8 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -133,6 +133,18 @@ class PARQUET_EXPORT FileDecryptionProperties { column_keys_[columnPath.ToDotString()] = key; } + bool HasColumnKey(const std::shared_ptr& columnPath, + const std::string& key_metadata = "") { + if (key_metadata.empty()) { + auto search = column_keys_.find(columnPath->ToDotString()); + return search != column_keys_.end(); + } + if (key_retriever_ == NULLPTR) { + return false; + } + return key_retriever_->GetKey(key_metadata).empty(); + } + const std::string& GetColumnKey(const std::shared_ptr& columnPath, const std::string& key_metadata = "") { if (key_metadata.empty()) { From 7717ac34859cd3aef6e6b4ca430a323e6f7d2021 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Thu, 21 Mar 2019 18:24:42 +0700 Subject: [PATCH 020/201] remove log --- cpp/src/parquet/encryption.h | 2 +- cpp/src/parquet/metadata.cc | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index d4d30076cd0a..028da9d28aeb 
100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -54,7 +54,7 @@ class PARQUET_EXPORT StringKeyIdRetriever : public DecryptionKeyRetriever { class PARQUET_EXPORT HiddenColumnException : public ParquetException { public: - HiddenColumnException(const std::string columnPath) + HiddenColumnException(const std::string& columnPath) : ParquetException(columnPath.c_str()) {} }; diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index ccd5b13918c5..0cfd9c3dc836 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -191,11 +191,11 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { // should decrypt metadata std::shared_ptr path = std::make_shared( ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema); - std::string key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata; std::cout << 1 << std::endl; + std::string key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata; if (!file_decryption->HasColumnKey(path, key_metadata)) { throw HiddenColumnException(path->ToDotString()); } - const std::string& key = file_decryption->GetColumnKey(path, key_metadata); std::cout << 2 << std::endl; + const std::string& key = file_decryption->GetColumnKey(path, key_metadata); DCHECK(algorithm != NULLPTR); // TODO: AAD From e9ed8c8f3e83cbc33c2eae1be4bbd0effb878e5f Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Thu, 21 Mar 2019 18:35:15 +0700 Subject: [PATCH 021/201] add example for hidden column --- .../low-level-api/encryption-reader-writer.cc | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc index 66c2d22f584e..27e179cfd7f1 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc @@ -74,20 +74,30 @@ int main(int argc, char** argv) { 
std::make_shared(FOOTER_ENCRYPTION_KEY); decryption_properties_2->SetColumnKey("ba_field", COLUMN_ENCRYPTION_KEY); - // plain mode footer = unencrypted footer + // plaintext mode footer = unencrypted footer parquet::FileEncryptionProperties::Builder file_encryption_builder_3; file_encryption_builder_3.footer_key(FOOTER_ENCRYPTION_KEY, false); std::shared_ptr decryption_properties_3 = std::make_shared(FOOTER_ENCRYPTION_KEY); + // plaintext mode footer, hidden column + parquet::FileEncryptionProperties::Builder file_encryption_builder_4; + file_encryption_builder_4.footer_key(FOOTER_ENCRYPTION_KEY, false); + file_encryption_builder_4.column_properties(encryption_cols, true); // reusing encryption_cols + + std::shared_ptr decryption_properties_4 = + std::make_shared(FOOTER_ENCRYPTION_KEY); + file_encryption_properties.push_back(file_encryption_builder_1.build()); file_encryption_properties.push_back(file_encryption_builder_2.build()); file_encryption_properties.push_back(file_encryption_builder_3.build()); + file_encryption_properties.push_back(file_encryption_builder_4.build()); file_decryption_properties.push_back(decryption_properties_1); file_decryption_properties.push_back(decryption_properties_2); file_decryption_properties.push_back(decryption_properties_3); + file_decryption_properties.push_back(decryption_properties_4); for (unsigned example_id = 0; example_id < file_encryption_properties.size(); ++example_id) { /********************************************************************************** @@ -449,6 +459,8 @@ int main(int argc, char** argv) { i++; } } + } catch (const parquet::HiddenColumnException& e) { + std::cerr << "Parquet read error: hidden column: " << e.what() << std::endl; } catch (const std::exception& e) { std::cerr << "Parquet read error: " << e.what() << std::endl; } From a193daaa467690e85109c6c2c67cde4dca9a82cb Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Fri, 29 Mar 2019 18:22:47 +0700 Subject: [PATCH 022/201] handle row group 
file_offset and total_compressed_size --- cpp/src/parquet/metadata.cc | 29 +++++++++++++++++++++++++++++ cpp/src/parquet/metadata.h | 1 + 2 files changed, 30 insertions(+) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 0cfd9c3dc836..e88bc6360d17 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -390,6 +390,10 @@ class RowGroupMetaData::RowGroupMetaDataImpl { inline int64_t total_byte_size() const { return row_group_->total_byte_size; } + inline int64_t file_offset() const { return row_group_->file_offset; } + + inline int64_t total_compressed_size() const { return row_group_->total_compressed_size; } + inline const SchemaDescriptor* schema() const { return schema_; } std::unique_ptr ColumnChunk(int i, FileDecryptionProperties* file_decryption = NULLPTR, @@ -988,6 +992,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } const ColumnDescriptor* descr() const { return column_; } + int64_t total_compressed_size() const { return column_metadata_.total_compressed_size; } private: void Init(format::ColumnChunk* column_chunk) { @@ -1061,6 +1066,10 @@ void ColumnChunkMetaDataBuilder::SetStatistics(const EncodedStatistics& result) impl_->SetStatistics(result); } +int64_t ColumnChunkMetaDataBuilder::total_compressed_size() const { + return impl_->total_compressed_size(); +} + class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { public: explicit RowGroupMetaDataBuilderImpl(const std::shared_ptr& props, @@ -1109,6 +1118,26 @@ class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { // columns"; // row_group_->__set_total_byte_size(total_byte_size); + + int64_t file_offset = 0; + int64_t total_compressed_size = 0; + + for (int i = 0; i < schema_->num_columns(); i++) { + if (!(row_group_->columns[i].file_offset >= 0)) { + std::stringstream ss; + ss << "Column " << i << " is not complete."; + throw ParquetException(ss.str()); + } + if (i == 0) { + file_offset = 
row_group_->columns[0].file_offset; + } + // sometimes column metadata is encrypted and not available to read, + // so we must get total_compressed_size from column builder + total_compressed_size += column_builders_[i]->total_compressed_size(); + } + + row_group_->__set_file_offset(file_offset); + row_group_->__set_total_compressed_size(total_compressed_size); row_group_->__set_total_byte_size(total_bytes_written); } diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 528680e821cb..685456d3b5d7 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -282,6 +282,7 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { void SetStatistics(const EncodedStatistics& stats); // get the column descriptor const ColumnDescriptor* descr() const; + int64_t total_compressed_size() const; // commit the metadata void Finish(int64_t num_values, int64_t dictonary_page_offset, int64_t index_page_offset, int64_t data_page_offset, From e42cc4aab2dd9fdd0ca2f2275c9defa939b525cd Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Mon, 15 Apr 2019 13:55:28 +0300 Subject: [PATCH 023/201] Apply API changes --- .../low-level-api/encryption-reader-writer.cc | 69 +- cpp/src/parquet/encryption.h | 6 + cpp/src/parquet/file_reader.cc | 44 +- cpp/src/parquet/file_writer.cc | 22 +- cpp/src/parquet/metadata.cc | 60 +- cpp/src/parquet/properties.h | 769 ++++++++++++------ cpp/src/parquet/schema.h | 7 + cpp/src/parquet/thrift.h | 6 +- cpp/src/parquet/types.h | 18 +- 9 files changed, 654 insertions(+), 347 deletions(-) diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc index 27e179cfd7f1..666c1a07e76e 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc @@ -53,51 +53,58 @@ int main(int argc, char** argv) { std::vector> file_decryption_properties; // uniform encryption - 
parquet::FileEncryptionProperties::Builder file_encryption_builder_1; - file_encryption_builder_1.footer_key(FOOTER_ENCRYPTION_KEY); - - std::shared_ptr decryption_properties_1 = - std::make_shared(FOOTER_ENCRYPTION_KEY); + parquet::FileEncryptionProperties::Builder file_encryption_builder_1(FOOTER_ENCRYPTION_KEY); + parquet::FileDecryptionProperties::Builder decryption_properties_builder_1; + decryption_properties_builder_1.withFooterKey(FOOTER_ENCRYPTION_KEY); // non-uniform with column keys - std::map> encryption_cols; - parquet::ColumnEncryptionProperties::Builder encryption_col_builder_0("ba_field", true); - encryption_col_builder_0.key(COLUMN_ENCRYPTION_KEY); + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> encryption_cols; + std::shared_ptr path_ptr = parquet::schema::ColumnPath::FromDotString("ba_field"); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_0(path_ptr); + encryption_col_builder_0.withKey(COLUMN_ENCRYPTION_KEY); auto encryption_col0 = encryption_col_builder_0.build(); - encryption_cols[encryption_col0->path()] = encryption_col0; + encryption_cols[path_ptr] = encryption_col0; + + parquet::FileEncryptionProperties::Builder file_encryption_builder_2(FOOTER_ENCRYPTION_KEY); + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> decryption_cols; + parquet::ColumnDecryptionProperties::Builder decryption_col_builder2(path_ptr); + decryption_col_builder2.withKey(COLUMN_ENCRYPTION_KEY); + decryption_cols[path_ptr] = decryption_col_builder2.build(); - parquet::FileEncryptionProperties::Builder file_encryption_builder_2; - file_encryption_builder_2.footer_key(FOOTER_ENCRYPTION_KEY); - file_encryption_builder_2.column_properties(encryption_cols, true); + file_encryption_builder_2.withEncryptedColumns(encryption_cols); - std::shared_ptr decryption_properties_2 = - std::make_shared(FOOTER_ENCRYPTION_KEY); - decryption_properties_2->SetColumnKey("ba_field", COLUMN_ENCRYPTION_KEY); + 
parquet::FileDecryptionProperties::Builder decryption_properties_builder_2; + decryption_properties_builder_2.withFooterKey(FOOTER_ENCRYPTION_KEY); + decryption_properties_builder_2.withColumnKeys(decryption_cols); - // plaintext mode footer = unencrypted footer - parquet::FileEncryptionProperties::Builder file_encryption_builder_3; - file_encryption_builder_3.footer_key(FOOTER_ENCRYPTION_KEY, false); + // plain mode footer = unencrypted footer + parquet::FileEncryptionProperties::Builder file_encryption_builder_3(FOOTER_ENCRYPTION_KEY); + file_encryption_builder_3.withPlaintextFooter(); - std::shared_ptr decryption_properties_3 = - std::make_shared(FOOTER_ENCRYPTION_KEY); + parquet::FileDecryptionProperties::Builder decryption_properties_builder_3; + decryption_properties_builder_3.withFooterKey(FOOTER_ENCRYPTION_KEY); - // plaintext mode footer, hidden column - parquet::FileEncryptionProperties::Builder file_encryption_builder_4; - file_encryption_builder_4.footer_key(FOOTER_ENCRYPTION_KEY, false); - file_encryption_builder_4.column_properties(encryption_cols, true); // reusing encryption_cols + // plaintext mode footer, hidden column + parquet::FileEncryptionProperties::Builder file_encryption_builder_4(FOOTER_ENCRYPTION_KEY); - std::shared_ptr decryption_properties_4 = - std::make_shared(FOOTER_ENCRYPTION_KEY); + file_encryption_builder_4.withPlaintextFooter(); + file_encryption_builder_4.withEncryptedColumns(encryption_cols); // reusing encryption_cols + parquet::FileDecryptionProperties::Builder decryption_properties_builder_4; + decryption_properties_builder_4.withFooterKey(FOOTER_ENCRYPTION_KEY); file_encryption_properties.push_back(file_encryption_builder_1.build()); file_encryption_properties.push_back(file_encryption_builder_2.build()); file_encryption_properties.push_back(file_encryption_builder_3.build()); file_encryption_properties.push_back(file_encryption_builder_4.build()); - file_decryption_properties.push_back(decryption_properties_1); - 
file_decryption_properties.push_back(decryption_properties_2); - file_decryption_properties.push_back(decryption_properties_3); - file_decryption_properties.push_back(decryption_properties_4); + file_decryption_properties.push_back(decryption_properties_builder_1.build()); + file_decryption_properties.push_back(decryption_properties_builder_2.build()); + file_decryption_properties.push_back(decryption_properties_builder_3.build()); + file_decryption_properties.push_back(decryption_properties_builder_4.build()); for (unsigned example_id = 0; example_id < file_encryption_properties.size(); ++example_id) { /********************************************************************************** @@ -468,4 +475,4 @@ int main(int argc, char** argv) { std::cout << "Example [" << (example_id+1) << "] Parquet Writing and Reading Complete" << std::endl; } return 0; -} \ No newline at end of file +} diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 028da9d28aeb..3902c4f11bd3 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -58,6 +58,12 @@ class PARQUET_EXPORT HiddenColumnException : public ParquetException { : ParquetException(columnPath.c_str()) {} }; +class PARQUET_EXPORT KeyAccessDeniedException : public ParquetException { + public: + KeyAccessDeniedException(const std::string &columnPath) + : ParquetException(columnPath.c_str()) {} +}; + } // namespace parquet #endif // PARQUET_ENCRYPTION_H diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 663c734981f4..8fb71423cff3 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -156,18 +156,26 @@ class SerializedRowGroup : public RowGroupReader::Contents { ? 
file_metadata_->footer_signing_key_metadata() : file_crypto_metadata_->key_metadata(); - std::string footer_key = file_decryption->GetFooterKey(footer_key_metadata); + std::string footer_key = file_decryption->getFooterKey(); + // ignore footer key metadata if footer key is explicitly set via API + if (footer_key.empty()) { + if (footer_key_metadata.empty()) + throw ParquetException("No footer key or key metadata"); + if (file_decryption->getKeyRetriever() == nullptr) + throw ParquetException("No footer key or key retriever"); + footer_key = file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); + } if (footer_key.empty()) { throw ParquetException("column is encrypted with null footer key"); } - Encryption::type algorithm = file_metadata_->is_plaintext_mode() + ParquetCipher::type algorithm = file_metadata_->is_plaintext_mode() ? file_metadata_->encryption_algorithm().algorithm : file_crypto_metadata_->encryption_algorithm().algorithm; auto footer_encryption = std::make_shared( - algorithm, footer_key, file_decryption->GetAad()); + algorithm, footer_key); return PageReader::Open(stream, col->num_values(), col->compression(), footer_encryption, properties_.memory_pool()); @@ -179,16 +187,18 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::shared_ptr column_path = std::make_shared(crypto_metadata->path_in_schema()); // encrypted with column key - std::string column_key = - file_decryption->GetColumnKey(column_path, column_key_metadata); - + std::string column_key; + if (column_key_metadata.empty()) + column_key = file_decryption->getColumnKey(column_path); + else if (file_decryption->getKeyRetriever() != nullptr) + column_key = file_decryption->getKeyRetriever()->GetKey(column_key_metadata); + if (column_key.empty()) { throw ParquetException("column is encrypted with null key, path=" + column_path->ToDotString()); } auto column_encryption = std::make_shared( - file_crypto_metadata_->encryption_algorithm().algorithm, column_key, - 
file_decryption->GetAad()); + file_crypto_metadata_->encryption_algorithm().algorithm, column_key); return PageReader::Open(stream, col->num_values(), col->compression(), column_encryption, properties_.memory_pool()); @@ -293,7 +303,12 @@ class SerializedFile : public ParquetFileReader::Contents { if (file_decryption == nullptr) { throw ParquetException("No decryption properties are provided. Could not verify plaintext footer metadata"); } - std::string footer_key = file_decryption->GetFooterKey(footer_key_metadata); + std::string footer_key; + if (footer_key_metadata.empty()) + footer_key = file_decryption->getFooterKey(); + else if (file_decryption->getKeyRetriever() != nullptr) + footer_key = file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); + if (footer_key.empty()) { throw ParquetException("No footer key are provided. Could not verify plaintext footer metadata"); } @@ -352,14 +367,19 @@ class SerializedFile : public ParquetFileReader::Contents { auto file_decryption = properties_.file_decryption(); if (file_decryption == nullptr) { throw ParquetException("No decryption properties are provided. Could not read encrypted footer metadata"); - } - std::string footer_key = file_decryption->GetFooterKey(footer_key_metadata); + } + std::string footer_key; + if (footer_key_metadata.empty()) + footer_key = file_decryption->getFooterKey(); + else if (file_decryption->getKeyRetriever() != nullptr) + footer_key = file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); + if (footer_key.size() == 0) { throw ParquetException("Invalid footer encryption key. 
Could not parse footer metadata"); } auto footer_encryption = std::make_shared( file_crypto_metadata_->encryption_algorithm().algorithm, footer_key, - file_decryption->GetAad()); + file_decryption->getAADPrefix()); file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &metadata_len, footer_encryption); } diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index ff83ded10487..0a33aff7b917 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -273,7 +273,7 @@ class FileSerializer : public ParquetFileWriter::Contents { WriteFileMetaData(*file_metadata_, sink_.get()); } else { - if (file_encryption->encrypt_footer()) { + if (file_encryption->encryptedFooter()) { // encrypted footer file_metadata_ = metadata_->Finish(); @@ -281,23 +281,27 @@ class FileSerializer : public ParquetFileWriter::Contents { auto crypto_metadata = metadata_->GetCryptoMetaData(); WriteFileCryptoMetaData(*crypto_metadata, sink_.get()); + ParquetCipher::type algorithm = file_encryption->getAlgorithm().algorithm; + // TODO: Fix AAD calculation std::shared_ptr footer_encryption = - file_encryption->GetFooterEncryptionProperties(); + std::make_shared(algorithm, + file_encryption->getFooterEncryptionKey()); WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryption, true); uint32_t footer_and_crypto_len = static_cast(sink_->Tell() - metadata_start); sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4); - + sink_->Write(PARQUET_EMAGIC, 4); } else { // footer plain mode EncryptionAlgorithm signing_encryption; - signing_encryption.algorithm = Encryption::AES_GCM_V1; - // TODO: AAD - file_metadata_ = metadata_->Finish(&signing_encryption, file_encryption->footer_key_metadata()); - - std::shared_ptr footer_encryption = - file_encryption->GetFooterEncryptionProperties(); + signing_encryption.algorithm = ParquetCipher::AES_GCM_V1; + file_metadata_ = metadata_->Finish(&signing_encryption, file_encryption->getFooterSigningKeyMetadata 
()); + // TODO: Fix AAD calculation + ParquetCipher::type algorithm = file_encryption->getAlgorithm().algorithm; + std::shared_ptr footer_encryption = + std::make_shared(algorithm, + file_encryption->getFooterSigningKey()); WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryption, false); } } diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index e88bc6360d17..835110ce70a7 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -190,29 +190,42 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { } // should decrypt metadata std::shared_ptr path = std::make_shared( - ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema); + ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema); std::string key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata; - if (!file_decryption->HasColumnKey(path, key_metadata)) { - throw HiddenColumnException(path->ToDotString()); - } - const std::string& key = file_decryption->GetColumnKey(path, key_metadata); - - DCHECK(algorithm != NULLPTR); - // TODO: AAD - auto encryption = std::make_shared(algorithm->algorithm, key); - - uint32_t len = static_cast(column->encrypted_column_metadata.size()); - DeserializeThriftMsg(reinterpret_cast(column->encrypted_column_metadata.c_str()), - &len, &metadata_, encryption, false); + std::string key = file_decryption->getColumnKey(path); + // No explicit column key given via API. Retrieve via key metadata. 
+ if (key.empty() && !key_metadata.empty() && file_decryption->getKeyRetriever() != nullptr){ + try { + key = file_decryption->getKeyRetriever()->GetKey(key_metadata); + } catch (KeyAccessDeniedException e) { + // Hidden column: encrypted, but key unavailable + throw HiddenColumnException("HiddenColumnException path=" + path->ToDotString()); + } + if (key.empty ()) + throw HiddenColumnException("HiddenColumnException path=" + path->ToDotString()); + } + + if (key.empty()) { + // Hidden column: encrypted, but key unavailable + throw HiddenColumnException("HiddenColumnException path= " + path->ToDotString()); + } + DCHECK(algorithm != NULLPTR); + + // TODO: AAD + auto encryption = std::make_shared(algorithm->algorithm, key); + + uint32_t len = static_cast(column->encrypted_column_metadata.size()); + DeserializeThriftMsg(reinterpret_cast(column->encrypted_column_metadata.c_str()), + &len, &metadata_, encryption, false); } } - + for (auto encoding : metadata_.encodings) { encodings_.push_back(FromThrift(encoding)); } possible_stats_ = nullptr; } - + // column chunk inline int64_t file_offset() const { return column_->file_offset; } inline const std::string& file_path() const { return column_->file_path; } @@ -922,7 +935,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { const auto& encrypt_md = properties_->column_encryption_props(column_->path()); // column is unencrypted - if (!encrypt_md || !encrypt_md->encrypted()) { + if (!encrypt_md || !encrypt_md->isEncrypted()) { column_chunk_->__isset.meta_data = true; column_chunk_->__set_meta_data(column_metadata_); @@ -932,12 +945,12 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { // encrypted with footer key format::ColumnCryptoMetaData ccmd; - if (encrypt_md->encrypted_with_footer_key()) { + if (encrypt_md->isEncryptedWithFooterKey()) { ccmd.__isset.ENCRYPTION_WITH_FOOTER_KEY = true; ccmd.__set_ENCRYPTION_WITH_FOOTER_KEY(format::EncryptionWithFooterKey()); } else { // encrypted 
with column key format::EncryptionWithColumnKey eck; - eck.__set_key_metadata(encrypt_md->key_metadata()); + eck.__set_key_metadata(encrypt_md->getKeyMetaData()); eck.__set_path_in_schema(column_->path()->ToDotVector()); ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY = true; ccmd.__set_ENCRYPTION_WITH_COLUMN_KEY(eck); @@ -948,8 +961,8 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { // non-uniform: footer is unencrypted, or column is encrypted with a column-specific // key - if ((footer_encryption == nullptr && encrypt_md->encrypted()) || - !encrypt_md->encrypted_with_footer_key()) { + if ((footer_encryption == nullptr && encrypt_md->isEncrypted()) || + !encrypt_md->isEncryptedWithFooterKey()) { // Thrift-serialize the ColumnMetaData structure, // encrypt it with the column key, and write to encrypted_column_metadata auto encrypt_props = properties_->encryption(column_->path()); @@ -1289,7 +1302,12 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { // TODO: aad metadata //encryption_algorithm.aad_metadata = file_encryption->aad_metadata(); crypto_metadata_->__set_encryption_algorithm(ToThrift(encryption_algorithm)); - std::string key_metadata = file_encryption->footer_key_metadata(); + std::string key_metadata; + if (file_encryption->encryptedFooter()) + key_metadata = file_encryption->getFooterEncryptionKeyMetadata(); + else + key_metadata = file_encryption->getFooterSigningKeyMetadata(); + if (!key_metadata.empty()) { crypto_metadata_->__set_key_metadata(key_metadata); } diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 350141cb2cb8..12db419c5664 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -32,7 +32,7 @@ #include "parquet/schema.h" #include "parquet/types.h" #include "arrow/util/logging.h" - +#include "arrow/util/utf8.h" namespace parquet { struct ParquetVersion { @@ -41,141 +41,361 @@ struct ParquetVersion { static int64_t DEFAULT_BUFFER_SIZE = 1024; static bool 
DEFAULT_USE_BUFFERED_STREAM = false; +static constexpr bool DEFAULT_CHECK_SIGNATURE = true; +static const std::string NULL_STRING = ""; class PARQUET_EXPORT ColumnEncryptionProperties { public: class Builder { public: - Builder(const std::string& path, bool encrypt) - : path_(path), encrypt_(encrypt), encrypted_with_footer_key_(encrypt) {} - - Builder* key(const std::string& key) { - DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); - DCHECK(encrypt_); - encrypted_with_footer_key_ = false; - + // Convenience builder for regular (not nested) columns. + Builder(const std::string name) { + Builder(schema::ColumnPath::FromDotString(name), true); + } + + // Convenience builder for encrypted columns. + Builder(const std::shared_ptr& path) : Builder(path, true) {} + + // Set a column-specific key. + // If key is not set on an encrypted column, the column will + // be encrypted with the footer key. + // keyBytes Key length must be either 16, 24 or 32 bytes. + Builder* withKey(const std::string& key) { + if (key.empty ()) + return this; + + DCHECK(!key.empty()); key_ = key; return this; } - Builder* key_metadata(const std::string& key_id) { - DCHECK(!key_id.empty()); - key_metadata_ = key_id; + + // Set a key retrieval metadata. + // use either withKeyMetaData or withKeyID, not both + Builder* withKeyMetaData(const std::string& key_metadata) { + DCHECK(!key_metadata.empty()); + DCHECK(key_metadata_.empty()); + key_metadata_ = key_metadata; return this; } - - Builder* key_id(uint32_t key_id) { - std::string key_metadata = std::string(reinterpret_cast(&key_id), 4); - this->key_metadata(key_metadata); + + // Set a key retrieval metadata (converted from String). + // use either withKeyMetaData or withKeyID, not both + // key_id will be converted to metadata (UTF-8 array). 
+ Builder* withKeyID(std::string key_id) { + //key_id is expected to be in UTF8 encoding + ::arrow::util::InitializeUTF8(); + if (!::arrow::util::ValidateUTF8(reinterpret_cast(key_id.c_str()), + key_id.size())) + throw ParquetException("key id should be in UTF8 encoding"); + + DCHECK(!key_id.empty()); + this->withKeyMetaData(key_id); return this; } - + std::shared_ptr build() { - return std::make_shared( - path_, encrypt_, encrypted_with_footer_key_, key_, key_metadata_); - } - - private: - std::string path_; - bool encrypt_; - bool encrypted_with_footer_key_; + return + std::shared_ptr(new ColumnEncryptionProperties( + encrypted_, + column_path_, + key_, + key_metadata_)); + } + + + private: + const std::shared_ptr column_path_; + bool encrypted_; std::string key_; std::string key_metadata_; + + Builder(const std::shared_ptr& path, bool encrypted) + : column_path_(path), encrypted_(encrypted) {} }; - + + const std::shared_ptr& getPath() { return column_path_; } + bool isEncrypted() const { return encrypted_; } + bool isEncryptedWithFooterKey() const { return encrypted_with_footer_key_; } + const std::string& getKey() const { return key_; } + const std::string& getKeyMetaData() const { return key_metadata_; } + ColumnEncryptionProperties() = default; ColumnEncryptionProperties(const ColumnEncryptionProperties& other) = default; ColumnEncryptionProperties(ColumnEncryptionProperties&& other) = default; - - ColumnEncryptionProperties(const std::string& path, bool encrypt, - bool encrypted_with_footer_key, const std::string& key, - const std::string& key_metadata) - : path_(path), - encrypt_(encrypt), - encrypted_with_footer_key_(encrypted_with_footer_key), - key_(key), - key_metadata_(key_metadata) {} - - const std::string& path() const { return path_; } - bool encrypted() const { return encrypt_; } - bool encrypted_with_footer_key() const { return encrypted_with_footer_key_; } - const std::string& key() const { return key_; } - const std::string& key_metadata() const 
{ return key_metadata_; } - + private: - std::string path_; - bool encrypt_; + const std::shared_ptr column_path_; + bool encrypted_; bool encrypted_with_footer_key_; std::string key_; std::string key_metadata_; + + explicit ColumnEncryptionProperties(bool encrypted, + const std::shared_ptr& column_path, + const std::string& key, + const std::string& key_metadata):column_path_(column_path){ + DCHECK(column_path != nullptr); + if (!encrypted) + DCHECK(key.empty() && key_metadata.empty()); + + if (!key.empty()) + DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); + + encrypted_with_footer_key_ = (encrypted && key.empty()); + if (encrypted_with_footer_key_) + DCHECK(key_metadata.empty()); + + encrypted_ = encrypted; + key_metadata_ = key_metadata; + key_ = key; + } }; - -class PARQUET_EXPORT FileDecryptionProperties { + +class PARQUET_EXPORT ColumnDecryptionProperties { public: - FileDecryptionProperties(const std::string& footer_key) : footer_key_(footer_key) { - DCHECK(footer_key_.length() == 16 || footer_key_.length() == 24 || - footer_key_.length() == 32); - } - - FileDecryptionProperties(const std::shared_ptr& key_retriever) - : key_retriever_(key_retriever) {} - - void SetAad(const std::string& aad) { aad_ = aad; } - - void SetColumnKey(const std::string& name, const std::string& key) { - SetColumnKey(std::vector({name}), key); - } - - void SetColumnKey(const std::vector& paths, const std::string& key) { - DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); - - schema::ColumnPath columnPath(paths); - - column_keys_[columnPath.ToDotString()] = key; - } - - bool HasColumnKey(const std::shared_ptr& columnPath, - const std::string& key_metadata = "") { - if (key_metadata.empty()) { - auto search = column_keys_.find(columnPath->ToDotString()); - return search != column_keys_.end(); + class Builder { + public: + // convenience builder for regular (not nested) columns. 
+ Builder(const std::string name) { + Builder(schema::ColumnPath::FromDotString(name)); + } + + Builder(const std::shared_ptr& path) + : column_path_(path) {} + + // Set an explicit column key. If applied on a file that contains key metadata for this column - + // the metadata will be ignored, the column will be decrypted with this key. + // key length must be either 16, 24 or 32 bytes. + Builder* withKey(const std::string& key) { + if (key.empty ()) + return this; + + DCHECK(!key.empty()); + key_ = key; + return this; } - if (key_retriever_ == NULLPTR) { - return false; + + std::shared_ptr build() { + return + std::shared_ptr(new ColumnDecryptionProperties(column_path_, + key_)); } - return key_retriever_->GetKey(key_metadata).empty(); + + private: + const std::shared_ptr column_path_; + std::string key_; + }; + + ColumnDecryptionProperties() = default; + ColumnDecryptionProperties(const ColumnDecryptionProperties& other) = default; + ColumnDecryptionProperties(ColumnDecryptionProperties&& other) = default; + + const std::shared_ptr& getPath() { return column_path_; } + const std::string& getKey() const { return key_; } + + private: + const std::shared_ptr column_path_; + std::string key_; + + // This class is only required for setting explicit column decryption keys - + // to override key retriever (or to provide keys when key metadata and/or + // key retriever are not available) + explicit ColumnDecryptionProperties(const std::shared_ptr& column_path, + const std::string& key):column_path_(column_path){ + DCHECK(column_path != nullptr); + + if (!key.empty()) + DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); + + key_ = key; } - - const std::string& GetColumnKey(const std::shared_ptr& columnPath, - const std::string& key_metadata = "") { - if (key_metadata.empty()) { - return column_keys_.at(columnPath->ToDotString()); +}; + +class PARQUET_EXPORT AADPrefixVerifier { + public: + // Verifies identity (AAD Prefix) of individual file, or of 
file collection in a data set. + // Throws exception if an AAD prefix is wrong. + // In a data set, AAD Prefixes should be collected, and then checked for missing files. + virtual void check(std::string aad_prefix) = 0; +}; + +class PARQUET_EXPORT FileDecryptionProperties { + public: + class Builder { + public: + Builder(){ + check_plaintext_footer_integrity_ = DEFAULT_CHECK_SIGNATURE; + } + + // Set an explicit footer key. If applied on a file that contains footer key metadata - + // the metadata will be ignored, the footer will be decrypted/verified with this key. + // If explicit key is not set, footer key will be fetched from key retriever. + //param footerKey Key length must be either 16, 24 or 32 bytes. + Builder* withFooterKey(std::string footer_key) { + if (footer_key.empty ()) { + return this; + } + DCHECK(!footer_key.empty()); + footer_key_ = footer_key; + return this; } - if (key_retriever_ == NULLPTR) { - throw ParquetException("no key retriever is provided for column key metadata"); + + // Set explicit column keys (decryption properties). + // Its also possible to set a key retriever on this property object. Upon file decryption, + // availability of explicit keys is checked before invocation of the retriever callback. + // If an explicit key is available for a footer or a column, its key metadata will + // be ignored. + Builder* withColumnKeys(const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& + column_properties) { + if (column_properties.size () == 0) + return this; + + if (column_property_map_.size () != 0) + throw ParquetException("Column properties already set"); + + column_property_map_ = column_properties; + return this; } - return key_retriever_->GetKey(key_metadata); - } - - const std::string& GetFooterKey(const std::string& footer_key_metadata = "") { - if (footer_key_metadata.empty()) { - return footer_key_; + + // Set a key retriever callback. 
Its also possible to + // set explicit footer or column keys on this file property object. Upon file decryption, + // availability of explicit keys is checked before invocation of the retriever callback. + // If an explicit key is available for a footer or a column, its key metadata will + // be ignored. + Builder* withKeyRetriever(const std::shared_ptr& + key_retriever) { + if (key_retriever == NULLPTR) + return this; + + DCHECK(key_retriever_ == NULLPTR); + key_retriever_ = key_retriever; + return this; + } + + // Skip integrity verification of plaintext footers. + // If not called, integrity of plaintext footers will be checked in runtime, + // and an exception will be thrown in the following situations: + // - footer signing key is not available (not passed, or not found by key retriever) + // - footer content and signature don't match + Builder* withoutFooterSignatureVerification() { + check_plaintext_footer_integrity_ = false; + return this; } - if (key_retriever_ == NULLPTR) { - throw ParquetException("no key retriever is provided for footer key metadata"); + + // Explicitly supply the file AAD prefix. + // A must when a prefix is used for file encryption, but not stored in file. + // If AAD prefix is stored in file, it will be compared to the explicitly + // supplied value and an exception will be thrown if they differ. + Builder* withAADPrefix(std::string aad_prefix) { + if (aad_prefix.empty()) { + return this; + } + DCHECK(aad_prefix_.empty()); + + aad_prefix_ = aad_prefix; + return this; } - return key_retriever_->GetKey(footer_key_metadata); + + // Set callback for verification of AAD Prefixes stored in file. 
+ Builder* withAADPrefixVerifier(std::shared_ptr aad_prefix_verifier) { + if (aad_prefix_verifier == NULLPTR) + return this; + + DCHECK(aad_prefix_verifier_ == NULLPTR); + aad_prefix_verifier_ = aad_prefix_verifier; + return this; + } + + std::shared_ptr build() { + return + std::shared_ptr(new FileDecryptionProperties(footer_key_, + key_retriever_, + check_plaintext_footer_integrity_, + aad_prefix_, + aad_prefix_verifier_, + column_property_map_)); + + } + + private: + std::string footer_key_; + std::string aad_prefix_; + std::shared_ptr aad_prefix_verifier_; + + std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath> column_property_map_; + + std::shared_ptr key_retriever_; + bool check_plaintext_footer_integrity_; + }; + + const std::string& getColumnKey(const std::shared_ptr& column_path) { + if (column_property_map_.find(column_path) != column_property_map_.end()) { + auto column_prop = column_property_map_[column_path]; + if (column_prop != nullptr) + return column_prop->getKey(); + } + return NULL_STRING; } - const std::string& GetAad() { return aad_; } - + + const std::string& getFooterKey() { + return footer_key_; + } + + const std::string& getAADPrefix() { return aad_prefix_; } + + std::shared_ptr getKeyRetriever() { + return key_retriever_; + } + + bool checkFooterIntegrity() { + return check_plaintext_footer_integrity_; + } + + const std::shared_ptr &getAADPrefixVerifier() { + return aad_prefix_verifier_; + } + private: std::string footer_key_; - std::string aad_; - - std::map column_keys_; - + std::string aad_prefix_; + std::shared_ptr aad_prefix_verifier_; + + std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath> column_property_map_; + std::shared_ptr key_retriever_; + bool check_plaintext_footer_integrity_; + + + FileDecryptionProperties(const std::string& footer_key, + const std::shared_ptr& key_retriever, + bool check_plaintext_footer_integrity, + std::string aad_prefix, + std::shared_ptr aad_prefix_verifier, + std::map, + 
std::shared_ptr, + schema::ColumnPath::CmpColumnPath> column_property_map) { + DCHECK(!footer_key.empty() || NULLPTR != key_retriever || 0 != column_property_map.size()); + if (!footer_key.empty()) + DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || + footer_key.length() == 32); + if (footer_key.empty() && check_plaintext_footer_integrity) + DCHECK(NULLPTR != key_retriever); + aad_prefix_verifier_ = aad_prefix_verifier; + footer_key_ = footer_key; + check_plaintext_footer_integrity_ = check_plaintext_footer_integrity; + key_retriever_ = key_retriever; + aad_prefix_ = aad_prefix; + column_property_map_ = column_property_map; + } + }; - + class PARQUET_EXPORT ReaderProperties { public: explicit ReaderProperties(MemoryPool* pool = ::arrow::default_memory_pool()) @@ -226,10 +446,9 @@ static constexpr ParquetVersion::type DEFAULT_WRITER_VERSION = ParquetVersion::PARQUET_1_0; static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION; static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED; -static constexpr Encryption::type DEFAULT_ENCRYPTION_ALGORITHM = Encryption::AES_GCM_V1; -static constexpr int32_t MAXIMAL_KEY_METADATA_LENGTH = 256; +static constexpr ParquetCipher::type DEFAULT_ENCRYPTION_ALGORITHM = ParquetCipher::AES_GCM_V1; static constexpr int32_t MAXIMAL_AAD_METADATA_LENGTH = 256; -static constexpr bool DEFAULT_ENCRYPT_THE_REST = true; +static constexpr bool DEFAULT_ENCRYPTED_FOOTER = true; class PARQUET_EXPORT ColumnProperties { public: @@ -281,181 +500,182 @@ class PARQUET_EXPORT ColumnProperties { class PARQUET_EXPORT FileEncryptionProperties { public: class Builder { - public: - Builder() : algorithm_(DEFAULT_ENCRYPTION_ALGORITHM) {} - - Builder(const std::string& key) - : algorithm_(DEFAULT_ENCRYPTION_ALGORITHM) { - DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); - footer_key_ = key; - } - - Builder* algorithm(Encryption::type algorithm) { - algorithm_ = algorithm; + public: + 
Builder(const std::string& footer_key) + : parquet_cipher_(DEFAULT_ENCRYPTION_ALGORITHM), encrypted_footer_(DEFAULT_ENCRYPTED_FOOTER) { + footer_key_ = footer_key; + store_aad_prefix_in_file_ = false; + } + + // Create files with plaintext footer. + // If not called, the files will be created with encrypted footer (default). + Builder* withPlaintextFooter() { + encrypted_footer_ = false; return this; } - - Builder* footer_key(const std::string& key, bool encrypt_footer=true) { - DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); - footer_key_ = key; - encrypt_footer_ = encrypt_footer; + + // Set encryption algorithm. + // If not called, files will be encrypted with AES_GCM_V1 (default). + Builder* withAlgorithm(ParquetCipher::type parquet_cipher) { + parquet_cipher_ = parquet_cipher; return this; } - - Builder* footer_key_metadata(const std::string& key_metadata) { - DCHECK(!footer_key_.empty()); - DCHECK(!key_metadata.empty() && - key_metadata.length() < MAXIMAL_KEY_METADATA_LENGTH); - footer_key_metadata_ = key_metadata; + + // Set a key retrieval metadata (converted from String). + // use either withFooterKeyMetaData or withFooterKeyID, not both. + Builder* withFooterKeyID(std::string key_id) { + //key_id is expected to be in UTF8 encoding + ::arrow::util::InitializeUTF8(); + if (!::arrow::util::ValidateUTF8(reinterpret_cast(key_id.c_str()), + key_id.size())) + throw ParquetException("footer key id should be in UTF8 encoding"); + + if (key_id.empty()) + return this; + + return withFooterKeyMetadata(key_id); + } + + // Set a key retrieval metadata. + // use either withFooterKeyMetaData or withFooterKeyID, not both. 
+ Builder* withFooterKeyMetadata(const std::string& footer_key_metadata) { + if (footer_key_metadata.empty()) + return this; + + DCHECK(footer_key_metadata_.empty()); + footer_key_metadata_ = footer_key_metadata; return this; } - - Builder* aad(const std::string& aad) { - DCHECK(!aad.empty()); - aad_ = aad; + + // Set the file AAD Prefix. + Builder* withAADPrefix(const std::string& aad_prefix) { + if (aad_prefix.empty()) + return this; + + DCHECK(aad_prefix_.empty()); + aad_prefix_ = aad_prefix; + store_aad_prefix_in_file_ = true; return this; } - - Builder* aad_metadata(const std::string& aad_metadata) { - DCHECK(!aad_.empty()); - DCHECK(!aad_metadata.empty() && - aad_metadata.length() < MAXIMAL_AAD_METADATA_LENGTH); - aad_metadata_ = aad_metadata; + + // Skip storing AAD Prefix in file. + // If not called, and if AAD Prefix is set, it will be stored. + Builder* withoutAADPrefixStorage() { + store_aad_prefix_in_file_ = false; return this; } - - /** - * encrypt_the_rest will define if other columns (not defined in columns argument) - * will be encrypted or not - * if encrypt_the_rest = true, other columns will be encrypted with footer key - * else, other columns will be unencrypted - */ - Builder* column_properties( - const std::map>& - column_properties, - bool encrypt_the_rest = DEFAULT_ENCRYPT_THE_REST) { - encrypt_the_rest_ = encrypt_the_rest; - column_properties_ = column_properties; - - if (!footer_key_.empty()) { - for (const auto& col : column_properties) { - if (col.second->key().compare(footer_key_) != 0) { - break; - } - } - } else { - if (encrypt_the_rest) { - throw ParquetException("Encrypt the rest with null footer key"); - } - bool all_are_unencrypted = true; - for (const auto& col : column_properties) { - if (col.second->encrypted()) { - if (col.second->key().empty()) { - throw ParquetException("Encrypt column with null footer key"); - } - all_are_unencrypted = false; - } - } - - if (all_are_unencrypted) { - throw ParquetException("Footer and all 
columns unencrypted"); - } - } + + // Set the list of encrypted columns and their properties (keys etc). + // If not called, all columns will be encrypted with the footer key. + // If called, the file columns not in the list will be left unencrypted. + Builder* withEncryptedColumns(const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& + encryptedColumns){ + if (encryptedColumns.size () == 0) + return this; + + if (column_property_map_.size () != 0) + throw ParquetException("Column properties already set"); + + column_property_map_ = encryptedColumns; return this; } - + std::shared_ptr build() { - std::shared_ptr footer_encryption; - if (!footer_key_.empty()) { - footer_encryption.reset(new EncryptionProperties(algorithm_, footer_key_, aad_)); - } - return std::make_shared( - footer_encryption, encrypt_footer_, footer_key_metadata_, aad_metadata_, - column_properties_, encrypt_the_rest_); - } - - private: - Encryption::type algorithm_; + return + std::shared_ptr(new FileEncryptionProperties( + parquet_cipher_, + footer_key_, + footer_key_metadata_, + encrypted_footer_, + aad_prefix_, + store_aad_prefix_in_file_, + column_property_map_)); + } + + private: + ParquetCipher::type parquet_cipher_; + bool encrypted_footer_; std::string footer_key_; - bool encrypt_footer_; std::string footer_key_metadata_; - - std::string aad_; - std::string aad_metadata_; - - std::map> column_properties_; - bool encrypt_the_rest_; + + std::string aad_prefix_; + bool store_aad_prefix_in_file_; + std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath> column_property_map_; }; - - FileEncryptionProperties( - const std::shared_ptr& footer_encryption, bool encrypt_footer, - const std::string& footer_key_metadata, const std::string& aad_metadata, - const std::map>& - column_properties, - bool encrypt_the_rest) - : footer_encryption_(footer_encryption), - encrypt_footer_(encrypt_footer), - footer_key_metadata_(footer_key_metadata), - aad_metadata_(aad_metadata), - 
column_properties_(column_properties), - encrypt_the_rest_(encrypt_the_rest) {} - - std::shared_ptr GetFooterEncryptionProperties() { - return footer_encryption_; + + bool encryptedFooter() const { return encrypted_footer_; } + + const EncryptionAlgorithm getAlgorithm() { + return algorithm_; } - - bool encrypt_footer() const { return encrypt_footer_; } - - const std::string& footer_key_metadata() const { return footer_key_metadata_; } - - const std::string& aad_metadata() const { return aad_metadata_; } - - std::shared_ptr GetColumnCryptoMetaData( - const std::shared_ptr& path) { - // non-uniform encryption - std::string path_str = path->ToDotString(); - if (column_properties_.find(path_str) != column_properties_.end()) { - return column_properties_[path_str]; - } - - // encrypted with footer key - if (encrypt_the_rest_) { - return ColumnEncryptionProperties::Builder(path->ToDotString(), true).build(); - } - - // uniform encryption or unencrypted - return ColumnEncryptionProperties::Builder(path->ToDotString(), false).build(); + + const std::string& getFooterEncryptionKey() { + return (encrypted_footer_? footer_key_ : NULL_STRING); } - - std::shared_ptr GetColumnEncryptionProperties( - const std::shared_ptr& path) { - // non-uniform encryption - std::string path_str = path->ToDotString(); - if (column_properties_.find(path_str) != column_properties_.end()) { - return std::make_shared(footer_encryption_->algorithm(), - column_properties_[path_str]->key(), - footer_encryption_->aad()); - } - - // encrypted with footer key - if (encrypt_the_rest_) { - return footer_encryption_; - } - - // uniform encryption or unencrypted + + const std::string& getFooterEncryptionKeyMetadata() { + return (encrypted_footer_? footer_key_metadata_ : NULL_STRING); + } + + const std::string& getFooterSigningKey() { + return (encrypted_footer_? NULL_STRING : footer_key_); + } + + const std::string& getFooterSigningKeyMetadata() { + return (encrypted_footer_? 
NULL_STRING : footer_key_metadata_); + } + + const std::string& getFileAAD() const { return file_AAD_; } + + std::shared_ptr + getColumnProperties(const std::shared_ptr& column_path) { + if (column_property_map_.size () == 0){ + auto builder = + std::shared_ptr( + new ColumnEncryptionProperties::Builder (column_path)); + return builder->build(); + } + if (column_property_map_.find(column_path) != column_property_map_.end()) + return column_property_map_[column_path]; + return NULLPTR; } - + private: - std::shared_ptr footer_encryption_; - bool encrypt_footer_; + EncryptionAlgorithm algorithm_; // encryption algorithm + std::string footer_key_; // encryption key, should have 16, 24, 32-byte length std::string footer_key_metadata_; - std::string aad_metadata_; - - std::map> column_properties_; - bool encrypt_the_rest_; + bool encrypted_footer_; + std::string file_AAD_; + + std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath> column_property_map_; + + FileEncryptionProperties(ParquetCipher::type cipher, + std::string footer_key, + std::string footer_key_metadata, + bool encrypted_footer, + const std::string& aad_prefix, bool store_aad_prefix_in_file, + const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& + column_property_map) + : footer_key_(footer_key), + footer_key_metadata_(footer_key_metadata), + encrypted_footer_(encrypted_footer), + column_property_map_(column_property_map){ + DCHECK(!footer_key.empty()); + // footer_key must be either 16, 24 or 32 bytes. 
+ DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || footer_key.length() == 32); + + } }; - + class PARQUET_EXPORT WriterProperties { public: class Builder { @@ -693,10 +913,16 @@ class PARQUET_EXPORT WriterProperties { if (parquet_file_encryption_ == NULLPTR) { return NULLPTR; } else { - return parquet_file_encryption_->GetFooterEncryptionProperties(); + std::string footer_key = parquet_file_encryption_->getFooterEncryptionKey (); + //TODO: Fix AAD calculation + if (footer_key.empty()) + footer_key = parquet_file_encryption_->getFooterSigningKey (); + return std::make_shared(parquet_file_encryption_->getAlgorithm().algorithm, + footer_key, parquet_file_encryption_->getFileAAD()); + } } - + inline Encoding::type dictionary_index_encoding() const { if (parquet_version_ == ParquetVersion::PARQUET_1_0) { return Encoding::PLAIN_DICTIONARY; @@ -740,24 +966,43 @@ class PARQUET_EXPORT WriterProperties { return column_properties(path).max_statistics_size(); } - std::shared_ptr column_encryption_props( - const std::shared_ptr& path) const { + std::shared_ptr column_encryption_props(const + std::shared_ptr& path) const { if (parquet_file_encryption_) { - return parquet_file_encryption_->GetColumnCryptoMetaData(path); + return parquet_file_encryption_->getColumnProperties(path); } else { return NULLPTR; } } std::shared_ptr encryption( - const std::shared_ptr& path) const { + const std::shared_ptr& path) const { if (parquet_file_encryption_) { - return parquet_file_encryption_->GetColumnEncryptionProperties(path); + auto column_prop = parquet_file_encryption_->getColumnProperties(path); + if (column_prop == NULLPTR) + return NULLPTR; + if (column_prop->isEncryptedWithFooterKey()) { + //TODO: Fix AAD calculation + if (parquet_file_encryption_->encryptedFooter ()) { + return std::make_shared(parquet_file_encryption_->getAlgorithm().algorithm, + parquet_file_encryption_->getFooterEncryptionKey(), + parquet_file_encryption_->getFileAAD()); + } else { + return 
std::make_shared(parquet_file_encryption_->getAlgorithm().algorithm, + parquet_file_encryption_->getFooterSigningKey(), + parquet_file_encryption_->getFileAAD()); + } + } + + return std::make_shared(parquet_file_encryption_->getAlgorithm().algorithm, + column_prop->getKey(), + parquet_file_encryption_->getFileAAD()); + } else { return NULLPTR; } } - + private: explicit WriterProperties( MemoryPool* pool, int64_t dictionary_pagesize_limit, int64_t write_batch_size, diff --git a/cpp/src/parquet/schema.h b/cpp/src/parquet/schema.h index 8fb3a5482f49..5647e2a9d864 100644 --- a/cpp/src/parquet/schema.h +++ b/cpp/src/parquet/schema.h @@ -88,6 +88,13 @@ class PARQUET_EXPORT ColumnPath { std::string ToDotString() const; const std::vector& ToDotVector() const; + struct CmpColumnPath { + bool operator()(const std::shared_ptr& a, + const std::shared_ptr& b) const { + return a->ToDotString() < b->ToDotString(); + } + }; + protected: std::vector path_; }; diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 8eb872abbb09..26cf7ea0bda3 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -102,11 +102,11 @@ static inline EncryptionAlgorithm FromThrift(format::EncryptionAlgorithm encrypt EncryptionAlgorithm encryption_algorithm; if (encryption.__isset.AES_GCM_V1) { - encryption_algorithm.algorithm = Encryption::AES_GCM_V1; + encryption_algorithm.algorithm = ParquetCipher::AES_GCM_V1; encryption_algorithm.aad = FromThrift(encryption.AES_GCM_V1); } else { - encryption_algorithm.algorithm = Encryption::AES_GCM_CTR_V1; + encryption_algorithm.algorithm = ParquetCipher::AES_GCM_CTR_V1; encryption_algorithm.aad = FromThrift(encryption.AES_GCM_CTR_V1); } return encryption_algorithm; @@ -180,7 +180,7 @@ static inline format::AesGcmCtrV1 ToAesGcmCtrV1Thrift(AadMetadata aad) { static inline format::EncryptionAlgorithm ToThrift(EncryptionAlgorithm encryption) { format::EncryptionAlgorithm encryption_algorithm; - if (encryption.algorithm == 
Encryption::AES_GCM_V1) { + if (encryption.algorithm == ParquetCipher::AES_GCM_V1) { encryption_algorithm.__isset.AES_GCM_V1 = true; encryption_algorithm.AES_GCM_V1 = ToAesGcmV1Thrift(encryption.aad); } else { diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 270e0b62cf8e..a9b857e37fea 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -464,7 +464,7 @@ struct Compression { PARQUET_EXPORT std::unique_ptr<::arrow::util::Codec> GetCodecFromArrow(Compression::type codec); -struct Encryption { +struct ParquetCipher { enum type { AES_GCM_V1 = 0, AES_GCM_CTR_V1 = 1 }; }; @@ -475,7 +475,7 @@ struct AadMetadata { }; struct EncryptionAlgorithm { - Encryption::type algorithm; + ParquetCipher::type algorithm; AadMetadata aad; }; @@ -490,7 +490,7 @@ class PARQUET_EXPORT EncryptionProperties { public: EncryptionProperties() = default; - EncryptionProperties(Encryption::type algorithm, const std::string& key, + EncryptionProperties(ParquetCipher::type algorithm, const std::string& key, const std::string& aad = "") : algorithm_(algorithm), key_(key), aad_(aad) {} @@ -503,31 +503,31 @@ class PARQUET_EXPORT EncryptionProperties { int aad_length() const { return static_cast(aad_.length()); } uint8_t* aad_bytes() const { return str2bytes(aad_); } - Encryption::type algorithm() const { return algorithm_; } + ParquetCipher::type algorithm() const { return algorithm_; } const std::string& key() const { return key_; } const std::string& aad() const { return aad_; } uint32_t CalculateCipherSize(uint32_t plain_len, bool is_metadata = false) const { - if (is_metadata || algorithm_ == Encryption::AES_GCM_V1) { + if (is_metadata || algorithm_ == ParquetCipher::AES_GCM_V1) { return plain_len + 28 + 4; - } else if (algorithm_ == Encryption::AES_GCM_CTR_V1) { + } else if (algorithm_ == ParquetCipher::AES_GCM_CTR_V1) { return plain_len + 16 + 4; } return plain_len; } uint32_t CalculatePlainSize(uint32_t cipher_len, bool is_metadata = false) const { - if 
(is_metadata || algorithm_ == Encryption::AES_GCM_V1) { + if (is_metadata || algorithm_ == ParquetCipher::AES_GCM_V1) { return cipher_len - 28 - 4; - } else if (algorithm_ == Encryption::AES_GCM_CTR_V1) { + } else if (algorithm_ == ParquetCipher::AES_GCM_CTR_V1) { return cipher_len - 16 - 4; } return cipher_len; } private: - Encryption::type algorithm_; // encryption algorithm + ParquetCipher::type algorithm_; // encryption algorithm std::string key_; // encryption key, should have 16, 24, 32-byte length std::string aad_; // encryption additional authenticated data }; From 8a80946ba87f3e89c875a6d7bd6c17cf98593965 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Tue, 16 Apr 2019 11:20:05 +0300 Subject: [PATCH 024/201] Add AAD calculation --- cpp/src/parquet/column_reader.cc | 88 +++++++++++- cpp/src/parquet/column_reader.h | 4 +- cpp/src/parquet/column_writer.cc | 66 +++++++-- cpp/src/parquet/column_writer.h | 3 +- cpp/src/parquet/encryption.h | 8 +- cpp/src/parquet/file_reader.cc | 227 +++++++++++++++++++++++-------- cpp/src/parquet/file_writer.cc | 52 +++++-- cpp/src/parquet/metadata.cc | 190 +++++++++++++++++--------- cpp/src/parquet/metadata.h | 40 ++++-- cpp/src/parquet/properties.h | 44 +++++- cpp/src/parquet/thrift.h | 15 +- cpp/src/parquet/types.h | 6 +- 12 files changed, 569 insertions(+), 174 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index c240dec8f165..5bef698051a3 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -102,6 +102,14 @@ int LevelDecoder::Decode(int batch_size, int16_t* levels) { ReaderProperties default_reader_properties() { static ReaderProperties default_reader_properties; + //reset column_map and fileAAD as default_reader_properties is static but + //can be used when reading parquet file with different reading options. 
+ if (default_reader_properties.column_map() != NULLPTR + && default_reader_properties.column_map()->size () != 0) + default_reader_properties.column_map()->clear(); + if (!default_reader_properties.fileAAD().empty()) + default_reader_properties.set_fileAAD (""); + return default_reader_properties; } @@ -116,16 +124,35 @@ class SerializedPageReader : public PageReader { public: SerializedPageReader(const std::shared_ptr& stream, int64_t total_num_rows, Compression::type codec, + bool column_has_dictionary, int16_t row_group_ordinal, + int16_t column_ordinal, const std::shared_ptr encryption, ::arrow::MemoryPool* pool) - : : stream_(stream), + : stream_(stream), decompression_buffer_(AllocateBuffer(pool, 0)), + first_page_(true), + column_has_dictionary_ (column_has_dictionary), + row_group_ordinal_(row_group_ordinal), + column_ordinal_(column_ordinal), + page_ordinal_(-1), seen_num_rows_(0), total_num_rows_(total_num_rows), encryption_(encryption), decryption_buffer_(AllocateBuffer(pool, 0)) { max_page_header_size_ = kDefaultMaxPageHeaderSize; decompressor_ = GetCodecFromArrow(codec); + if (encryption != NULLPTR) { + DCHECK (!encryption_->fileAAD().empty()); + //prepare the AAD for quick update later + data_pageAAD_ = parquet_encryption::createModuleAAD(encryption_->fileAAD(), + parquet_encryption::DataPage, + row_group_ordinal_, + column_ordinal_, (int16_t)-1); + data_page_headerAAD_ = parquet_encryption::createModuleAAD(encryption_->fileAAD(), + parquet_encryption::DataPageHeader, + row_group_ordinal_, + column_ordinal_, (int16_t)-1); + } } // Implement the PageReader interface @@ -143,6 +170,14 @@ class SerializedPageReader : public PageReader { std::unique_ptr<::arrow::util::Codec> decompressor_; std::shared_ptr decompression_buffer_; + bool first_page_; + bool column_has_dictionary_; + int16_t row_group_ordinal_; + int16_t column_ordinal_; + int16_t page_ordinal_; + std::string data_pageAAD_; + std::string data_page_headerAAD_; + // Maximum allowed page size 
uint32_t max_page_header_size_; @@ -160,9 +195,21 @@ class SerializedPageReader : public PageReader { std::shared_ptr SerializedPageReader::NextPage() { // Loop here because there may be unhandled page types that we skip until // finding a page that we do know what to do with + bool current_page_is_dictionary = false; + if (column_has_dictionary_ ){ + if (first_page_) { + current_page_is_dictionary = true; + first_page_ = false; + } else + page_ordinal_++; + } else + page_ordinal_++; + + while (seen_num_rows_ < total_num_rows_) { uint32_t header_size = 0; uint32_t allowed_page_size = kDefaultPageHeaderSize; + std::string aad; // Page headers can be very large because of page statistics // We try to deserialize a larger buffer progressively @@ -177,9 +224,20 @@ std::shared_ptr SerializedPageReader::NextPage() { // This gets used, then set by DeserializeThriftMsg header_size = static_cast(buffer.size()); try { - DeserializeThriftMsg(reinterpret_cast(buffer.data()), - &header_size, ¤t_page_header_, - encryption_); + if (encryption_!= NULLPTR) { + if (current_page_is_dictionary) { + aad = parquet_encryption::createModuleAAD(encryption_->fileAAD(), + parquet_encryption::DictionaryPageHeader, + row_group_ordinal_, + column_ordinal_, (int16_t)-1); + encryption_->aad(aad); + } else { + parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, page_ordinal_); + encryption_->aad(data_page_headerAAD_); + } + } + DeserializeThriftMsg(reinterpret_cast(buffer.data()), + &header_size, ¤t_page_header_, encryption_); break; } catch (std::exception& e) { // Failed to deserialize. 
Double the allowed page header size and try again @@ -197,7 +255,20 @@ std::shared_ptr SerializedPageReader::NextPage() { int compressed_len = current_page_header_.compressed_page_size; int uncompressed_len = current_page_header_.uncompressed_page_size; - + if (encryption_!= NULLPTR){ + DCHECK(!encryption_->fileAAD().empty()); + if (current_page_is_dictionary){ + aad = parquet_encryption::createModuleAAD(encryption_->fileAAD(), + parquet_encryption::DictionaryPage, + row_group_ordinal_, + column_ordinal_, (int16_t)-1); + encryption_->aad(aad); + } else { + parquet_encryption::quickUpdatePageAAD(data_pageAAD_, page_ordinal_); + encryption_->aad(data_pageAAD_); + } + } + // Read the compressed data page. std::shared_ptr page_buffer; PARQUET_THROW_NOT_OK(stream_->Read(compressed_len, &page_buffer)); @@ -285,10 +356,13 @@ std::shared_ptr SerializedPageReader::NextPage() { std::unique_ptr PageReader::Open( const std::shared_ptr& stream, int64_t total_num_rows, - Compression::type codec, const std::shared_ptr encryption, + Compression::type codec, bool column_has_dictionary, + int16_t row_group_ordinal, int16_t column_ordinal, + const std::shared_ptr encryption, ::arrow::MemoryPool* pool) { return std::unique_ptr( - new SerializedPageReader(stream, total_num_rows, codec, encryption, pool)); + new SerializedPageReader(stream, total_num_rows, codec, column_has_dictionary, + row_group_ordinal, column_ordinal, encryption, pool)); } // ---------------------------------------------------------------------- diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 17c5ad6524bb..81479fdd7c36 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -80,7 +80,9 @@ class PARQUET_EXPORT PageReader { static std::unique_ptr Open( const std::shared_ptr& stream, int64_t total_num_rows, - Compression::type codec, const std::shared_ptr& encryption = NULLPTR, + Compression::type codec, bool column_has_dictionary = false, + int16_t 
row_group_ordinal = -1, int16_t column_ordinal = -1, + const std::shared_ptr& encryption = NULLPTR, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); // @returns: shared_ptr(nullptr) on EOS, std::shared_ptr diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 6e359bb49795..2c7f58ce37ed 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -139,7 +139,8 @@ class SerializedPageWriter : public PageWriter { SerializedPageWriter(const std::shared_ptr& sink, Compression::type codec, const std::shared_ptr& encryption, - ColumnChunkMetaDataBuilder* metadata, + ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, + int16_t column_chunk_ordinal, MemoryPool* pool = arrow::default_memory_pool()) : sink_(sink), metadata_(metadata), @@ -149,7 +150,21 @@ class SerializedPageWriter : public PageWriter { data_page_offset_(0), total_uncompressed_size_(0), total_compressed_size_(0), - encryption_(encryption) { + page_ordinal_(0), + row_group_ordinal_(row_group_ordinal), + column_ordinal_(column_chunk_ordinal), + encryption_(encryption){ + if (encryption != NULLPTR) { + //prepare the add for quick update later + data_pageAAD_ = parquet_encryption::createModuleAAD(encryption_->fileAAD(), + parquet_encryption::DataPage, + row_group_ordinal_, + column_ordinal_, (int16_t)-1); + data_page_headerAAD_ = parquet_encryption::createModuleAAD(encryption_->fileAAD(), + parquet_encryption::DataPageHeader, + row_group_ordinal_, + column_ordinal_, (int16_t)-1); + } compressor_ = GetCodecFromArrow(codec); thrift_serializer_.reset(new ThriftSerializer); } @@ -176,12 +191,21 @@ class SerializedPageWriter : public PageWriter { std::shared_ptr encrypted_data_buffer = nullptr; if (encryption_.get()) { + encryption_->aad(parquet_encryption::createModuleAAD(encryption_->fileAAD(), + parquet_encryption::DictionaryPage, + row_group_ordinal_, + column_ordinal_, (int16_t)-1)); + encrypted_data_buffer = 
std::static_pointer_cast( AllocateBuffer(pool_, encryption_->CalculateCipherSize(output_data_len))); output_data_len = parquet_encryption::Encrypt( encryption_, false, compressed_data->data(), output_data_len, encrypted_data_buffer->mutable_data()); output_data_buffer = encrypted_data_buffer->data(); + encryption_->aad(parquet_encryption::createModuleAAD(encryption_->fileAAD(), + parquet_encryption::DictionaryPageHeader, + row_group_ordinal_, + column_ordinal_, (int16_t)-1)); } format::PageHeader page_header; @@ -212,9 +236,15 @@ class SerializedPageWriter : public PageWriter { metadata_->Finish(num_values_, dictionary_page_offset_, -1, data_page_offset_, total_compressed_size_, total_uncompressed_size_, has_dictionary, fallback); - + auto props = metadata_->get_encryption_props(metadata_->descr()->path()); + if (props != nullptr){ + props->aad(parquet_encryption::createModuleAAD(encryption_->fileAAD(), + parquet_encryption::ColumnMetaData, + row_group_ordinal_, + column_ordinal_, (int16_t)-1)); + } // Write metadata at end of column chunk - metadata_->WriteTo(sink_.get()); + metadata_->WriteTo(sink_.get(), props); } /** @@ -241,7 +271,6 @@ class SerializedPageWriter : public PageWriter { int64_t WriteDataPage(const CompressedDataPage& page) override { int64_t uncompressed_size = page.uncompressed_size(); std::shared_ptr compressed_data = page.buffer(); - format::DataPageHeader data_page_header; data_page_header.__set_num_values(page.num_values()); data_page_header.__set_encoding(ToThrift(page.encoding())); @@ -256,11 +285,15 @@ class SerializedPageWriter : public PageWriter { std::shared_ptr encrypted_data_buffer = AllocateBuffer(pool_, 0); if (encryption_.get()) { + parquet_encryption::quickUpdatePageAAD(data_pageAAD_, page_ordinal_); + encryption_->aad(data_pageAAD_); encrypted_data_buffer->Resize(encryption_->CalculateCipherSize(output_data_len)); output_data_len = parquet_encryption::Encrypt( encryption_, false, compressed_data->data(), output_data_len, 
encrypted_data_buffer->mutable_data()); output_data_buffer = encrypted_data_buffer->data(); + parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, page_ordinal_); + encryption_->aad(data_page_headerAAD_); } format::PageHeader page_header; @@ -283,6 +316,7 @@ class SerializedPageWriter : public PageWriter { total_compressed_size_ += output_data_len + header_size; num_values_ += page.num_values(); + page_ordinal_++; int64_t current_pos = -1; PARQUET_THROW_NOT_OK(sink_->Tell(¤t_pos)); return current_pos - start_pos; @@ -309,6 +343,11 @@ class SerializedPageWriter : public PageWriter { int64_t data_page_offset_; int64_t total_uncompressed_size_; int64_t total_compressed_size_; + int16_t page_ordinal_; + int16_t row_group_ordinal_; + int16_t column_ordinal_; + std::string data_pageAAD_; + std::string data_page_headerAAD_; std::unique_ptr thrift_serializer_; @@ -323,12 +362,14 @@ class BufferedPageWriter : public PageWriter { BufferedPageWriter(const std::shared_ptr& sink, Compression::type codec, const std::shared_ptr& encryption, - ColumnChunkMetaDataBuilder* metadata, + ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, + int16_t current_column_ordinal, MemoryPool* pool = arrow::default_memory_pool()) : final_sink_(sink), metadata_(metadata) { in_memory_sink_ = CreateOutputStream(pool); pager_ = std::unique_ptr( - new SerializedPageWriter(in_memory_sink_, codec, encryption, metadata, pool)); + new SerializedPageWriter(in_memory_sink_, codec, encryption, metadata, + row_group_ordinal, current_column_ordinal, pool)); } // TODO: nullptr for EncryptionProperties int64_t WriteDictionaryPage(const DictionaryPage& page) override { @@ -373,13 +414,18 @@ class BufferedPageWriter : public PageWriter { std::unique_ptr PageWriter::Open( const std::shared_ptr& sink, Compression::type codec, const std::shared_ptr& encryption, - ColumnChunkMetaDataBuilder* metadata, MemoryPool* pool, bool buffered_row_group) { + ColumnChunkMetaDataBuilder* metadata, int16_t 
row_group_ordinal, + int16_t column_chunk_ordinal, MemoryPool* pool, bool buffered_row_group) { if (buffered_row_group) { return std::unique_ptr( - new BufferedPageWriter(sink, codec, encryption, metadata, pool)); + new BufferedPageWriter(sink, codec, encryption, metadata, + row_group_ordinal, column_chunk_ordinal, + pool)); } else { return std::unique_ptr( - new SerializedPageWriter(sink, codec, encryption, metadata, pool)); + new SerializedPageWriter(sink, codec, encryption, metadata, + row_group_ordinal, column_chunk_ordinal, + pool)); } } diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index 7d8a1e6a3057..d190c503a858 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -85,7 +85,8 @@ class PARQUET_EXPORT PageWriter { static std::unique_ptr Open( const std::shared_ptr& sink, Compression::type codec, const std::shared_ptr& encryption, - ColumnChunkMetaDataBuilder* metadata, + ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal = -1, + int16_t column_chunk_ordinal = -1, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), bool buffered_row_group = false); diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 3902c4f11bd3..2147900b1c52 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -60,10 +60,16 @@ class PARQUET_EXPORT HiddenColumnException : public ParquetException { class PARQUET_EXPORT KeyAccessDeniedException : public ParquetException { public: - KeyAccessDeniedException(const std::string &columnPath) + KeyAccessDeniedException(const std::string& columnPath) : ParquetException(columnPath.c_str()) {} }; +class PARQUET_EXPORT UnsupportedOperationException : public ParquetException { + public: + UnsupportedOperationException(const std::string& columnPath) + : ParquetException(columnPath.c_str()) {} +}; + } // namespace parquet #endif // PARQUET_ENCRYPTION_H diff --git a/cpp/src/parquet/file_reader.cc 
b/cpp/src/parquet/file_reader.cc index 8fb71423cff3..36905839b18f 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -38,6 +38,8 @@ #include "parquet/properties.h" #include "parquet/schema.h" #include "parquet/types.h" +#include "parquet/util/memory.h" +#include "parquet/util/crypto.h" namespace parquet { @@ -88,7 +90,8 @@ class SerializedRowGroup : public RowGroupReader::Contents { : source_(source), file_metadata_(file_metadata), file_crypto_metadata_(file_crypto_metadata), - properties_(props) { + properties_(props), + row_group_ordinal_((int16_t)row_group_number){ row_group_metadata_ = file_metadata->RowGroup(row_group_number); } @@ -104,10 +107,14 @@ class SerializedRowGroup : public RowGroupReader::Contents { else if (file_metadata_->is_plaintext_mode()) { algorithm = file_metadata_->encryption_algorithm(); } + std::shared_ptr, + std::string, parquet::schema::ColumnPath::CmpColumnPath>> + column_map = properties_.column_map(); // Read column chunk from the file - auto col = row_group_metadata_->ColumnChunk(i, properties_.file_decryption(), - &algorithm); - + auto col = row_group_metadata_->ColumnChunk(i, row_group_ordinal_, + properties_.file_decryption(), + &algorithm, properties_.fileAAD(), + column_map); int64_t col_start = col->data_page_offset(); if (col->has_dictionary_page() && col->dictionary_page_offset() > 0 && col_start > col->dictionary_page_offset()) { @@ -143,11 +150,17 @@ class SerializedRowGroup : public RowGroupReader::Contents { if (!encrypted) { return PageReader::Open(stream, col->num_values(), col->compression(), + col->has_dictionary_page(), + row_group_ordinal_, (int16_t)i/*column_ordinal*/, nullptr, properties_.memory_pool()); } // the column is encrypted - + std::string aad = parquet_encryption::createModuleAAD(properties_.fileAAD(), + parquet_encryption::ColumnMetaData, + row_group_ordinal_, + (int16_t)i, (int16_t)-1); + auto file_decryption = properties_.file_decryption(); // the column is encrypted 
with footer key @@ -159,25 +172,27 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::string footer_key = file_decryption->getFooterKey(); // ignore footer key metadata if footer key is explicitly set via API if (footer_key.empty()) { - if (footer_key_metadata.empty()) - throw ParquetException("No footer key or key metadata"); - - if (file_decryption->getKeyRetriever() == nullptr) - throw ParquetException("No footer key or key retriever"); - footer_key = file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); + if (footer_key_metadata.empty()) + throw ParquetException("No footer key or key metadata"); + + if (file_decryption->getKeyRetriever() == nullptr) + throw ParquetException("No footer key or key retriever"); + footer_key = file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); } if (footer_key.empty()) { throw ParquetException("column is encrypted with null footer key"); } - + ParquetCipher::type algorithm = file_metadata_->is_plaintext_mode() ? 
file_metadata_->encryption_algorithm().algorithm : file_crypto_metadata_->encryption_algorithm().algorithm; + auto footer_encryption = std::make_shared( - algorithm, footer_key); + algorithm, footer_key, properties_.fileAAD(), aad); return PageReader::Open(stream, col->num_values(), col->compression(), + col->has_dictionary_page(), row_group_ordinal_, (int16_t)i, footer_encryption, properties_.memory_pool()); } @@ -188,19 +203,37 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::make_shared(crypto_metadata->path_in_schema()); // encrypted with column key std::string column_key; - if (column_key_metadata.empty()) + // first look if we already got the key from before + if (column_map != NULLPTR && column_map->find(column_path) != column_map->end()) { + column_key = column_map->at(column_path); + } + else { column_key = file_decryption->getColumnKey(column_path); - else if (file_decryption->getKeyRetriever() != nullptr) - column_key = file_decryption->getKeyRetriever()->GetKey(column_key_metadata); + // No explicit column key given via API. Retrieve via key metadata. 
+ if (column_key.empty() && !column_key_metadata.empty() && + file_decryption->getKeyRetriever() != nullptr){ + try { + column_key = file_decryption->getKeyRetriever()->GetKey(column_key_metadata); + } catch (KeyAccessDeniedException &e) { + std::stringstream ss; + ss << e.what(); + ss << " HiddenColumnException, path=" + column_path->ToDotString(); + throw HiddenColumnException(ss.str()); + } + } + } if (column_key.empty()) { - throw ParquetException("column is encrypted with null key, path=" + - column_path->ToDotString()); + throw HiddenColumnException("column is encrypted with null key, path=" + + column_path->ToDotString()); } auto column_encryption = std::make_shared( - file_crypto_metadata_->encryption_algorithm().algorithm, column_key); - + file_crypto_metadata_->encryption_algorithm().algorithm, + column_key, + properties_.fileAAD(), aad); + return PageReader::Open(stream, col->num_values(), col->compression(), + col->has_dictionary_page(), row_group_ordinal_, (int16_t)i, column_encryption, properties_.memory_pool()); } @@ -210,6 +243,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { FileCryptoMetaData* file_crypto_metadata_; std::unique_ptr row_group_metadata_; ReaderProperties properties_; + int16_t row_group_ordinal_; }; // ---------------------------------------------------------------------- @@ -295,30 +329,72 @@ class SerializedFile : public ParquetFileReader::Contents { if (file_metadata_->is_plaintext_mode()) { if (metadata_len - read_metadata_len != 28) { - throw ParquetException("Invalid parquet file. Cannot verify plaintext mode footer."); + throw ParquetException("Invalid parquet file. Cannot verify plaintext" + "mode footer."); } // get footer key std::string footer_key_metadata = file_metadata_->footer_signing_key_metadata(); auto file_decryption = properties_.file_decryption(); if (file_decryption == nullptr) { - throw ParquetException("No decryption properties are provided. 
Could not verify plaintext footer metadata"); + throw ParquetException("No decryption properties are provided. " + "Could not verify plaintext footer metadata"); } - std::string footer_key; - if (footer_key_metadata.empty()) - footer_key = file_decryption->getFooterKey(); - else if (file_decryption->getKeyRetriever() != nullptr) - footer_key = file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); - + std::string footer_key = file_decryption->getFooterKey(); + // ignore footer key metadata if footer key is explicitly set via API + if (footer_key.empty()) { + if (footer_key_metadata.empty()) throw ParquetException("No footer key or " + "key metadata"); + if (file_decryption->getKeyRetriever() == nullptr) + throw ParquetException("No footer key or key retriever"); + try { + footer_key = file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); + } catch (KeyAccessDeniedException &e) { + std::stringstream ss; + ss << e.what(); + ss << "Footer key: access denied"; + throw ParquetException(ss.str()); + } + } if (footer_key.empty()) { - throw ParquetException("No footer key are provided. Could not verify plaintext footer metadata"); + throw ParquetException("Footer key unavailable. 
Could not verify plaintext " + "footer metadata"); } - // TODO: aad + EncryptionAlgorithm algo = file_metadata_->encryption_algorithm(); + bool supply_aad_prefix = algo.aad.supply_aad_prefix; + std::string aad_file_unique = algo.aad.aad_file_unique; + std::string aad_prefix = algo.aad.aad_prefix; + if (algo.algorithm != ParquetCipher::AES_GCM_CTR_V1 + && algo.algorithm != ParquetCipher::AES_GCM_V1) + throw ParquetException("Unsupported algorithm"); + if (!file_decryption->getAADPrefix().empty()) { + if (file_decryption->getAADPrefix().compare(aad_prefix) != 0) { + throw ParquetException("ADD Prefix in file and in properties is not the same"); + } + std::shared_ptr aad_prefix_verifier = + file_decryption->getAADPrefixVerifier(); + if (aad_prefix_verifier != NULLPTR) { + aad_prefix_verifier->check(aad_prefix); + } + } + if (supply_aad_prefix && file_decryption->getAADPrefix().empty()) { + throw ParquetException("AAD prefix used for file encryption, but not stored in " + "file and not supplied in decryption properties"); + } + std::string fileAAD; + if (!supply_aad_prefix) + fileAAD = aad_prefix + aad_file_unique; + else + fileAAD = file_decryption->getAADPrefix() + aad_file_unique; + + properties_.set_fileAAD(fileAAD); + std::string aad = parquet_encryption::createFooterAAD(fileAAD); auto encryption = std::make_shared( - file_metadata_->encryption_algorithm().algorithm, - footer_key - ); - if (! file_metadata_->verify(encryption, metadata_buffer->data() + read_metadata_len, 28)) { - throw ParquetException("Invalid parquet file. Could not verify plaintext footer metadata"); + file_metadata_->encryption_algorithm().algorithm, + footer_key, fileAAD, aad); + if (! file_metadata_->verify(encryption, metadata_buffer->data() + + read_metadata_len, 28)) { + throw ParquetException("Invalid parquet file. 
Could not verify plaintext" + " footer metadata"); } } } @@ -348,40 +424,83 @@ class SerializedFile : public ParquetFileReader::Contents { throw ParquetException("Invalid parquet file. Could not read metadata bytes."); } } - + auto file_decryption = properties_.file_decryption(); + if (file_decryption == nullptr) { + throw ParquetException("No decryption properties are provided. Could not read " + "encrypted footer metadata"); + } + uint32_t crypto_metadata_len = footer_len; file_crypto_metadata_ = - FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); - + FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); + EncryptionAlgorithm algo = file_crypto_metadata_->encryption_algorithm(); + bool supply_aad_prefix = algo.aad.supply_aad_prefix; + std::string aad_file_unique = algo.aad.aad_file_unique; + std::string aad_prefix = algo.aad.aad_prefix; + if (algo.algorithm != ParquetCipher::AES_GCM_CTR_V1 + && algo.algorithm != ParquetCipher::AES_GCM_V1) + throw ParquetException("Unsupported algorithm"); + if (!file_decryption->getAADPrefix().empty()) { + if (file_decryption->getAADPrefix().compare(aad_prefix) != 0) { + throw ParquetException("ADD Prefix in file and in properties is not the same"); + } + std::shared_ptr aad_prefix_verifier = + file_decryption->getAADPrefixVerifier(); + if (aad_prefix_verifier != NULLPTR) { + aad_prefix_verifier->check(aad_prefix); + } + } + if (supply_aad_prefix && file_decryption->getAADPrefix().empty()) { + throw ParquetException("AAD prefix used for file encryption, but not stored in file " + "and not supplied in decryption properties"); + } + std::string fileAAD; + if (!supply_aad_prefix) + fileAAD = aad_prefix + aad_file_unique; + else + fileAAD = file_decryption->getAADPrefix() + aad_file_unique; + //save fileAAD for later use + properties_.set_fileAAD(fileAAD); + std::string aad = parquet_encryption::createFooterAAD(fileAAD); + int64_t metadata_offset = file_size - kFooterSize - 
footer_len + crypto_metadata_len; uint32_t metadata_len = footer_len - crypto_metadata_len; std::shared_ptr metadata_buffer; PARQUET_THROW_NOT_OK( source_->ReadAt(metadata_offset, metadata_len, &metadata_buffer)); if (metadata_buffer->size() != metadata_len) { - throw ParquetException("Invalid encrypted parquet file. Could not read footer metadata bytes."); + throw ParquetException("Invalid encrypted parquet file. " + "Could not read footer metadata bytes."); } - + // get footer key metadata std::string footer_key_metadata = file_crypto_metadata_->key_metadata(); - auto file_decryption = properties_.file_decryption(); - if (file_decryption == nullptr) { - throw ParquetException("No decryption properties are provided. Could not read encrypted footer metadata"); - } - std::string footer_key; - if (footer_key_metadata.empty()) - footer_key = file_decryption->getFooterKey(); - else if (file_decryption->getKeyRetriever() != nullptr) - footer_key = file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); + std::string footer_key = file_decryption->getFooterKey(); + if (footer_key.empty()) { + if (footer_key_metadata.empty()) throw ParquetException("No footer key or " + "key metadata"); + if (file_decryption->getKeyRetriever() == nullptr) + throw ParquetException("No footer key or key retriever"); + try { + footer_key = file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); + } catch (KeyAccessDeniedException &e) { + std::stringstream ss; + ss << e.what(); + ss << "Footer key: access denied"; + throw ParquetException(ss.str()); + } + } - if (footer_key.size() == 0) { - throw ParquetException("Invalid footer encryption key. Could not parse footer metadata"); + if (footer_key.empty()) { + throw ParquetException("Invalid footer encryption key. 
" + "Could not parse footer metadata"); } auto footer_encryption = std::make_shared( - file_crypto_metadata_->encryption_algorithm().algorithm, footer_key, - file_decryption->getAADPrefix()); + file_crypto_metadata_->encryption_algorithm().algorithm, + footer_key, + fileAAD, aad); file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &metadata_len, - footer_encryption); + footer_encryption); } } diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 0a33aff7b917..c60555cf54b6 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -27,6 +27,8 @@ #include "parquet/exception.h" #include "parquet/platform.h" #include "parquet/schema.h" +#include "parquet/util/memory.h" +#include "parquet/util/crypto.h" using arrow::MemoryPool; @@ -81,13 +83,16 @@ inline void ThrowRowsMisMatchError(int col, int64_t prev, int64_t curr) { class RowGroupSerializer : public RowGroupWriter::Contents { public: RowGroupSerializer(const std::shared_ptr& sink, - RowGroupMetaDataBuilder* metadata, + RowGroupMetaDataBuilder* metadata, + int16_t row_group_ordinal, const WriterProperties* properties, bool buffered_row_group = false) + : sink_(sink), metadata_(metadata), properties_(properties), total_bytes_written_(0), closed_(false), + row_group_ordinal_ (row_group_ordinal), next_column_index_(0), num_rows_(0), buffered_row_group_(buffered_row_group) { @@ -128,8 +133,10 @@ class RowGroupSerializer : public RowGroupWriter::Contents { const ColumnDescriptor* column_descr = col_meta->descr(); std::unique_ptr pager = PageWriter::Open(sink_, properties_->compression(column_descr->path()), - properties_->encryption(column_descr->path()), col_meta, // TODO - properties_->memory_pool()); + properties_->encryption(column_descr->path()), col_meta, + row_group_ordinal_, (int16_t)(current_column_index_-1), + properties_->memory_pool()); + column_writers_[0] = ColumnWriter::Make(col_meta, std::move(pager), properties_); return 
column_writers_[0].get(); } @@ -184,7 +191,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents { // Ensures all columns have been written metadata_->set_num_rows(num_rows_); - metadata_->Finish(total_bytes_written_); + metadata_->Finish(total_bytes_written_, row_group_ordinal_); } } @@ -194,6 +201,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents { const WriterProperties* properties_; int64_t total_bytes_written_; bool closed_; + int16_t row_group_ordinal_; int next_column_index_; mutable int64_t num_rows_; bool buffered_row_group_; @@ -225,9 +233,12 @@ class RowGroupSerializer : public RowGroupWriter::Contents { auto col_meta = metadata_->NextColumnChunk(); const ColumnDescriptor* column_descr = col_meta->descr(); std::unique_ptr pager = - PageWriter::Open(sink_, properties_->compression(column_descr->path()), + PageWriter::Open(sink_, properties_->compression(column_descr->path()), properties_->encryption(column_descr->path()), col_meta, + (int16_t)row_group_ordinal_, + (int16_t)current_column_index_, properties_->memory_pool(), buffered_row_group_); + column_writers_.push_back( ColumnWriter::Make(col_meta, std::move(pager), properties_)); } @@ -280,28 +291,37 @@ class FileSerializer : public ParquetFileWriter::Contents { uint64_t metadata_start = static_cast(sink_->Tell()); auto crypto_metadata = metadata_->GetCryptoMetaData(); WriteFileCryptoMetaData(*crypto_metadata, sink_.get()); - - ParquetCipher::type algorithm = file_encryption->getAlgorithm().algorithm; - // TODO: Fix AAD calculation + + ParquetCipher::type algorithm = file_encryption->getAlgorithm().algorithm; + std::string aad = parquet_encryption::createFooterAAD(file_encryption->getFileAAD()); std::shared_ptr footer_encryption = std::make_shared(algorithm, - file_encryption->getFooterEncryptionKey()); + file_encryption->getFooterEncryptionKey(), + file_encryption->getFileAAD(), aad); WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryption, true); uint32_t 
footer_and_crypto_len = static_cast(sink_->Tell() - metadata_start); sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4); - sink_->Write(PARQUET_EMAGIC, 4); } else { // footer plain mode EncryptionAlgorithm signing_encryption; + EncryptionAlgorithm algo = file_encryption->getAlgorithm(); + + signing_encryption.aad.aad_file_unique = algo.aad.aad_file_unique; + signing_encryption.aad.supply_aad_prefix = algo.aad.supply_aad_prefix; + if (!algo.aad.supply_aad_prefix) + signing_encryption.aad.aad_prefix = algo.aad.aad_prefix; signing_encryption.algorithm = ParquetCipher::AES_GCM_V1; - file_metadata_ = metadata_->Finish(&signing_encryption, file_encryption->getFooterSigningKeyMetadata ()); - // TODO: Fix AAD calculation - ParquetCipher::type algorithm = file_encryption->getAlgorithm().algorithm; + + file_metadata_ = metadata_->Finish(&signing_encryption, + file_encryption->getFooterSigningKeyMetadata ()); + ParquetCipher::type algorithm = algo.algorithm; + std::string aad = parquet_encryption::createFooterAAD(file_encryption->getFileAAD()); std::shared_ptr footer_encryption = std::make_shared(algorithm, - file_encryption->getFooterSigningKey()); + file_encryption->getFooterSigningKey(), + file_encryption->getFileAAD(), aad); WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryption, false); } } @@ -327,7 +347,9 @@ class FileSerializer : public ParquetFileWriter::Contents { num_row_groups_++; auto rg_metadata = metadata_->AppendRowGroup(); std::unique_ptr contents(new RowGroupSerializer( - sink_, rg_metadata, properties_.get(), buffered_row_group)); + sink_, rg_metadata, (int16_t)(num_row_groups_-1), properties_.get(), + buffered_row_group)); + row_group_writer_.reset(new RowGroupWriter(std::move(contents))); return row_group_writer_.get(); } diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 835110ce70a7..7c0d2dc97ff3 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -174,11 +174,15 @@ class 
ColumnChunkMetaData::ColumnChunkMetaDataImpl { public: explicit ColumnChunkMetaDataImpl(const format::ColumnChunk* column, const ColumnDescriptor* descr, - const ApplicationVersion* writer_version, + int16_t row_group_ordinal, + int16_t column_ordinal, + const ApplicationVersion* writer_version, FileDecryptionProperties* file_decryption = NULLPTR, - const EncryptionAlgorithm* algorithm = NULLPTR) - : column_(column), descr_(descr), writer_version_(writer_version) { - + const EncryptionAlgorithm* algorithm = NULLPTR, + const std::string& fileAAD = "", + std::shared_ptr, std::string, + schema::ColumnPath::CmpColumnPath>> column_map = NULLPTR) + : column_(column), descr_(descr), writer_version_(writer_version) { metadata_ = column->meta_data; if (column->__isset.crypto_metadata) { @@ -189,43 +193,60 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { throw ParquetException("Cannot decrypt ColumnMetadata. FileDecryptionProperties must be provided."); } // should decrypt metadata - std::shared_ptr path = std::make_shared( - ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema); + std::shared_ptr path = + std::make_shared( + ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema); std::string key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata; - std::string key = file_decryption->getColumnKey(path); - // No explicit column key given via API. Retrieve via key metadata. 
- if (key.empty() && !key_metadata.empty() && file_decryption->getKeyRetriever() != nullptr){ - try { - key = file_decryption->getKeyRetriever()->GetKey(key_metadata); - } catch (KeyAccessDeniedException e) { - // Hidden column: encrypted, but key unavailable - throw HiddenColumnException("HiddenColumnException path=" + path->ToDotString()); + std::string key; + if (column_map != NULLPTR && (column_map->find(path) != column_map->end())) { + //First retrieve the key in column_map if exists + key = column_map->at(path); + } else { + key = file_decryption->getColumnKey(path); + // No explicit column key given via API. Retrieve via key metadata. + if (key.empty() && !key_metadata.empty() && file_decryption->getKeyRetriever() != nullptr){ + try { + key = file_decryption->getKeyRetriever()->GetKey(key_metadata); + } catch (KeyAccessDeniedException &e) { + // Hidden column: encrypted, but key unavailable + std::stringstream ss; + ss << e.what(); + ss << " HiddenColumnException path=" + path->ToDotString(); + throw HiddenColumnException(ss.str()); + } + if (key.empty ()) + throw HiddenColumnException("HiddenColumnException path=" + path->ToDotString()); + } + if (column_map != NULLPTR) { + // save column key for future use + (*column_map)[path]=key; } - if (key.empty ()) - throw HiddenColumnException("HiddenColumnException path=" + path->ToDotString()); } - if (key.empty()) { // Hidden column: encrypted, but key unavailable throw HiddenColumnException("HiddenColumnException path= " + path->ToDotString()); } - DCHECK(algorithm != NULLPTR); + DCHECK(algorithm != NULLPTR); + + std::string aad = parquet_encryption::createModuleAAD(fileAAD, + parquet_encryption::ColumnMetaData, + row_group_ordinal, + column_ordinal, (int16_t)-1); - // TODO: AAD - auto encryption = std::make_shared(algorithm->algorithm, key); + auto encryption = std::make_shared(algorithm->algorithm, + key, fileAAD, aad); uint32_t len = static_cast(column->encrypted_column_metadata.size()); 
DeserializeThriftMsg(reinterpret_cast(column->encrypted_column_metadata.c_str()), &len, &metadata_, encryption, false); } } - for (auto encoding : metadata_.encodings) { encodings_.push_back(FromThrift(encoding)); } possible_stats_ = nullptr; } - + // column chunk inline int64_t file_offset() const { return column_->file_offset; } inline const std::string& file_path() const { return column_->file_path; } @@ -314,23 +335,39 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { std::unique_ptr ColumnChunkMetaData::Make( const void* metadata, const ColumnDescriptor* descr, + int16_t row_group_ordinal, int16_t column_ordinal, const ApplicationVersion* writer_version, FileDecryptionProperties* file_decryption, - const EncryptionAlgorithm* algorithm) { + const EncryptionAlgorithm* algorithm, + const std::string& fileAAD, + std::shared_ptr, + std::string, schema::ColumnPath::CmpColumnPath>> column_map) { return std::unique_ptr( - new ColumnChunkMetaData(metadata, descr, writer_version, file_decryption, algorithm)); + new ColumnChunkMetaData(metadata, descr, + row_group_ordinal, column_ordinal, + writer_version, file_decryption, + algorithm, fileAAD, column_map)); } - + ColumnChunkMetaData::ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, + int16_t row_group_ordinal, int16_t column_ordinal, const ApplicationVersion* writer_version, - FileDecryptionProperties* file_decryption, - const EncryptionAlgorithm* algorithm) - : impl_{std::unique_ptr(new ColumnChunkMetaDataImpl( - reinterpret_cast(metadata), descr, - writer_version, file_decryption, algorithm))} {} + FileDecryptionProperties* file_decryption, + const EncryptionAlgorithm* algorithm, + const std::string& fileAAD, + std::shared_ptr, + std::string, schema::ColumnPath::CmpColumnPath>> column_map) +: impl_{std::unique_ptr(new ColumnChunkMetaDataImpl( + reinterpret_cast(metadata), + descr, + row_group_ordinal, + column_ordinal, + writer_version, + file_decryption, algorithm, + fileAAD, 
column_map))} {} ColumnChunkMetaData::~ColumnChunkMetaData() {} - + // column chunk int64_t ColumnChunkMetaData::file_offset() const { return impl_->file_offset(); } @@ -409,8 +446,13 @@ class RowGroupMetaData::RowGroupMetaDataImpl { inline const SchemaDescriptor* schema() const { return schema_; } - std::unique_ptr ColumnChunk(int i, FileDecryptionProperties* file_decryption = NULLPTR, - const EncryptionAlgorithm* algorithm = NULLPTR) { + std::unique_ptr ColumnChunk(int i, int16_t row_group_ordinal, + FileDecryptionProperties* file_decryption = NULLPTR, + const EncryptionAlgorithm* algorithm = NULLPTR, + const std::string& fileAAD = "", + std::shared_ptr, + std::string, + parquet::schema::ColumnPath::CmpColumnPath>> column_map = NULLPTR) { if (!(i < num_columns())) { std::stringstream ss; ss << "The file only has " << num_columns() @@ -418,7 +460,8 @@ class RowGroupMetaData::RowGroupMetaDataImpl { throw ParquetException(ss.str()); } return ColumnChunkMetaData::Make(&row_group_->columns[i], schema_->Column(i), - writer_version_, file_decryption, algorithm); + row_group_ordinal, (int16_t)i, + writer_version_, file_decryption, algorithm, fileAAD, column_map); } private: @@ -449,9 +492,16 @@ int64_t RowGroupMetaData::total_byte_size() const { return impl_->total_byte_siz const SchemaDescriptor* RowGroupMetaData::schema() const { return impl_->schema(); } -std::unique_ptr RowGroupMetaData::ColumnChunk(int i, FileDecryptionProperties* file_decryption, - const EncryptionAlgorithm* algorithm) const { - return impl_->ColumnChunk(i, file_decryption, algorithm); +std::unique_ptr RowGroupMetaData::ColumnChunk(int i, + int16_t row_group_ordinal, + FileDecryptionProperties* file_decryption, + const EncryptionAlgorithm* algorithm, + const std::string& fileAAD, + std::shared_ptr, + std::string, + schema::ColumnPath::CmpColumnPath>> column_map) const { + return impl_->ColumnChunk(i, row_group_ordinal, file_decryption, algorithm, + fileAAD, column_map); } // file metadata @@ 
-491,11 +541,13 @@ class FileMetaData::FileMetaDataImpl { uint8_t* tag = const_cast(reinterpret_cast(tail)) + 12; std::vector encrypted_buffer(encryption->CalculateCipherSize(serialized_len)); - uint32_t encrypted_len = parquet_encryption::SignedFooterEncrypt( - serialized_data, serialized_len, encryption->key_bytes(), - encryption->key_length(), encryption->aad_bytes(), encryption->aad_length(), - nonce, 12, encrypted_buffer.data()); - + uint32_t encrypted_len = + parquet_encryption::SignedFooterEncrypt(serialized_data, serialized_len, + encryption->key_bytes(), + encryption->key_length(), + encryption->aad_bytes(), + encryption->aad_length(), + nonce, 12, encrypted_buffer.data()); return 0 == memcmp(encrypted_buffer.data() + encrypted_len - 16, tag, 16); } @@ -703,7 +755,13 @@ void FileMetaData::AppendRowGroups(const FileMetaData& other) { impl_->AppendRowGroups(other.impl_); } -void FileMetaData::WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryption) const { +void FileMetaData::WriteTo(::arrow::io::OutputStream* dst, + const std::shared_ptr& encryption) const { + if (encryption != nullptr) + encryption->aad(parquet_encryption::createModuleAAD(encryption->fileAAD(), + parquet_encryption::Footer, + (int16_t)-1, (int16_t)-1, + (int16_t)-1)); return impl_->WriteTo(dst, encryption); } @@ -930,7 +988,8 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { column_metadata_.__set_encodings(thrift_encodings); } - void WriteTo(::arrow::io::OutputStream* sink) { + void WriteTo(::arrow::io::OutputStream* sink, + const std::shared_ptr& encryption) { ThriftSerializer serializer; const auto& encrypt_md = properties_->column_encryption_props(column_->path()); @@ -965,20 +1024,20 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { !encrypt_md->isEncryptedWithFooterKey()) { // Thrift-serialize the ColumnMetaData structure, // encrypt it with the column key, and write to encrypted_column_metadata - auto encrypt_props = 
properties_->encryption(column_->path()); + auto encrypt_props = encryption; + uint8_t* serialized_data; + uint32_t serialized_len; - uint8_t* serialized_data; - uint32_t serialized_len; serializer.SerializeToBuffer(&column_metadata_, &serialized_len, &serialized_data); + // encrypt the footer key std::vector encrypted_data(encrypt_props->CalculateCipherSize(serialized_len)); + unsigned encrypted_len = parquet_encryption::Encrypt( encrypt_props, true, serialized_data, serialized_len, encrypted_data.data()); - const char* temp = const_cast(reinterpret_cast(encrypted_data.data())); std::string encrypted_column_metadata(temp, encrypted_len); column_chunk_->__set_encrypted_column_metadata(encrypted_column_metadata); - // Keep redacted metadata version for old readers if (footer_encryption == nullptr) { format::ColumnMetaData metadata_redacted = column_metadata_; @@ -1004,6 +1063,11 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } } + std::shared_ptr + get_encryption_props(const std::shared_ptr& path) { + return properties_->encryption(path); + } + const ColumnDescriptor* descr() const { return column_; } int64_t total_compressed_size() const { return column_metadata_.total_compressed_size; } @@ -1057,6 +1121,11 @@ void ColumnChunkMetaDataBuilder::set_file_path(const std::string& path) { impl_->set_file_path(path); } +std::shared_ptr +ColumnChunkMetaDataBuilder::get_encryption_props(const std::shared_ptr& path) { + return impl_->get_encryption_props(path); +} + void ColumnChunkMetaDataBuilder::Finish(int64_t num_values, int64_t dictionary_page_offset, int64_t index_page_offset, @@ -1067,10 +1136,11 @@ void ColumnChunkMetaDataBuilder::Finish(int64_t num_values, compressed_size, uncompressed_size, has_dictionary, dictionary_fallback); } -void ColumnChunkMetaDataBuilder::WriteTo(::arrow::io::OutputStream* sink) { - impl_->WriteTo(sink); +void ColumnChunkMetaDataBuilder::WriteTo(::arrow::io::OutputStream* sink, + const std::shared_ptr& encryption) { + 
impl_->WriteTo(sink, encryption); } - + const ColumnDescriptor* ColumnChunkMetaDataBuilder::descr() const { return impl_->descr(); } @@ -1109,7 +1179,7 @@ class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { int current_column() { return next_column_ - 1; } - void Finish(int64_t total_bytes_written) { + void Finish(int64_t total_bytes_written, int16_t row_group_ordinal) { if (!(next_column_ == schema_->num_columns())) { std::stringstream ss; ss << "Only " << next_column_ - 1 << " out of " << schema_->num_columns() @@ -1131,10 +1201,8 @@ class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { // columns"; // row_group_->__set_total_byte_size(total_byte_size); - int64_t file_offset = 0; int64_t total_compressed_size = 0; - for (int i = 0; i < schema_->num_columns(); i++) { if (!(row_group_->columns[i].file_offset >= 0)) { std::stringstream ss; @@ -1152,6 +1220,7 @@ class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { row_group_->__set_file_offset(file_offset); row_group_->__set_total_compressed_size(total_compressed_size); row_group_->__set_total_byte_size(total_bytes_written); + row_group_->__set_ordinal(row_group_ordinal); } void set_num_rows(int64_t num_rows) { row_group_->num_rows = num_rows; } @@ -1199,8 +1268,8 @@ void RowGroupMetaDataBuilder::set_num_rows(int64_t num_rows) { impl_->set_num_rows(num_rows); } -void RowGroupMetaDataBuilder::Finish(int64_t total_bytes_written) { - impl_->Finish(total_bytes_written); +void RowGroupMetaDataBuilder::Finish(int64_t total_bytes_written, int16_t row_group_ordinal) { + impl_->Finish(total_bytes_written, row_group_ordinal); } // file metadata @@ -1296,12 +1365,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { auto file_encryption = properties_->file_encryption(); auto footer_encryption = properties_->footer_encryption(); - // build format::FileCryptoMetaData - EncryptionAlgorithm encryption_algorithm; - encryption_algorithm.algorithm = footer_encryption->algorithm(); - // TODO: aad metadata - 
//encryption_algorithm.aad_metadata = file_encryption->aad_metadata(); - crypto_metadata_->__set_encryption_algorithm(ToThrift(encryption_algorithm)); + crypto_metadata_->__set_encryption_algorithm(ToThrift(file_encryption->getAlgorithm())); std::string key_metadata; if (file_encryption->encryptedFooter()) key_metadata = file_encryption->getFooterEncryptionKeyMetadata(); diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 685456d3b5d7..84b84705ba5f 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -22,10 +22,12 @@ #include #include #include - +#include + #include "arrow/util/key_value_metadata.h" #include "parquet/platform.h" +#include "parquet/schema.h" #include "parquet/properties.h" #include "parquet/types.h" @@ -35,6 +37,7 @@ class ColumnDescriptor; class EncodedStatistics; class Statistics; class SchemaDescriptor; +class FileCryptoMetaData; namespace schema { @@ -113,16 +116,18 @@ class PARQUET_EXPORT ColumnCryptoMetaData { std::unique_ptr impl_; }; -class FileCryptoMetaData; - class PARQUET_EXPORT ColumnChunkMetaData { public: // API convenience to get a MetaData accessor static std::unique_ptr Make( const void* metadata, const ColumnDescriptor* descr, + int16_t row_group_ordinal = -1, int16_t column_ordinal = -1, const ApplicationVersion* writer_version = NULLPTR, - FileDecryptionProperties* file_decryption = NULLPTR, - const EncryptionAlgorithm* algorithm = NULLPTR); + FileDecryptionProperties* file_decryption = NULLPTR, + const EncryptionAlgorithm* algorithm = NULLPTR, + const std::string &fileAAD = "", + std::shared_ptr, + std::string, schema::ColumnPath::CmpColumnPath>> column_map = NULLPTR); ~ColumnChunkMetaData(); @@ -151,9 +156,14 @@ class PARQUET_EXPORT ColumnChunkMetaData { private: explicit ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, + int16_t row_group_ordinal, int16_t column_ordinal, const ApplicationVersion* writer_version = NULLPTR, FileDecryptionProperties* 
file_decryption = NULLPTR, - const EncryptionAlgorithm* algorithm = NULLPTR); + const EncryptionAlgorithm* algorithm = NULLPTR, + const std::string& fileAAD = "", + std::shared_ptr, + std::string, parquet::schema::ColumnPath::CmpColumnPath>> + column_map = NULLPTR); // PIMPL Idiom class ColumnChunkMetaDataImpl; std::unique_ptr impl_; @@ -174,8 +184,14 @@ class PARQUET_EXPORT RowGroupMetaData { int64_t total_byte_size() const; // Return const-pointer to make it clear that this object is not to be copied const SchemaDescriptor* schema() const; - std::unique_ptr ColumnChunk(int i, FileDecryptionProperties* file_decryption = NULLPTR, - const EncryptionAlgorithm* algorithm = NULLPTR) const; + std::unique_ptr ColumnChunk(int i, int16_t row_group_ordinal = -1, + FileDecryptionProperties* file_decryption = NULLPTR, + const EncryptionAlgorithm* algorithm = NULLPTR, + const std::string &fileAAD = "", + std::shared_ptr, + std::string, + parquet::schema::ColumnPath::CmpColumnPath>> + column_map = NULLPTR) const; private: explicit RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema, @@ -282,6 +298,7 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { void SetStatistics(const EncodedStatistics& stats); // get the column descriptor const ColumnDescriptor* descr() const; + int64_t total_compressed_size() const; // commit the metadata void Finish(int64_t num_values, int64_t dictonary_page_offset, @@ -293,7 +310,10 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { const void* contents() const; // For writing metadata at end of column chunk - void WriteTo(::arrow::io::OutputStream* sink); + void WriteTo(::arrow::io::OutputStream* sink, + const std::shared_ptr& encryption = NULLPTR); + std::shared_ptr + get_encryption_props(const std::shared_ptr& path); private: explicit ColumnChunkMetaDataBuilder(const std::shared_ptr& props, @@ -322,7 +342,7 @@ class PARQUET_EXPORT RowGroupMetaDataBuilder { void set_num_rows(int64_t num_rows); // commit the metadata - void 
Finish(int64_t total_bytes_written); + void Finish(int64_t total_bytes_written, int16_t row_group_ordinal = -1); private: explicit RowGroupMetaDataBuilder(const std::shared_ptr& props, diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 12db419c5664..c306fd89ec8d 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -23,6 +23,9 @@ #include #include +#include +#include + #include "arrow/type.h" #include "parquet/encryption.h" @@ -402,6 +405,10 @@ class PARQUET_EXPORT ReaderProperties { : pool_(pool) { buffered_stream_enabled_ = DEFAULT_USE_BUFFERED_STREAM; buffer_size_ = DEFAULT_BUFFER_SIZE; + column_map_ = std::shared_ptr, + std::string, parquet::schema::ColumnPath::CmpColumnPath>>(new std::map, + std::string, + parquet::schema::ColumnPath::CmpColumnPath>()); } MemoryPool* memory_pool() const { return pool_; } @@ -419,6 +426,15 @@ class PARQUET_EXPORT ReaderProperties { int64_t buffer_size() const { return buffer_size_; } + std::shared_ptr, + std::string, parquet::schema::ColumnPath::CmpColumnPath>> column_map () { + return column_map_; + } + + const std::string& fileAAD() { return fileAAD_; } + + void set_fileAAD (std::string fileAAD) { fileAAD_ = fileAAD; } + void file_decryption(const std::shared_ptr& decryption) { file_decryption_ = decryption; } @@ -430,6 +446,10 @@ class PARQUET_EXPORT ReaderProperties { int64_t buffer_size_; bool buffered_stream_enabled_; std::shared_ptr file_decryption_; + std::shared_ptr, + std::string, parquet::schema::ColumnPath::CmpColumnPath>> column_map_; // a map between + //ColumnPath and their encryption keys + std::string fileAAD_; }; ReaderProperties PARQUET_EXPORT default_reader_properties(); @@ -449,6 +469,7 @@ static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOM static constexpr ParquetCipher::type DEFAULT_ENCRYPTION_ALGORITHM = ParquetCipher::AES_GCM_V1; static constexpr int32_t MAXIMAL_AAD_METADATA_LENGTH = 256; static constexpr bool 
DEFAULT_ENCRYPTED_FOOTER = true; +static constexpr int32_t AAD_FILE_UNIQUE_LENGTH = 8; class PARQUET_EXPORT ColumnProperties { public: @@ -672,8 +693,27 @@ class PARQUET_EXPORT FileEncryptionProperties { DCHECK(!footer_key.empty()); // footer_key must be either 16, 24 or 32 bytes. DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || footer_key.length() == 32); + + uint8_t aad_file_unique[AAD_FILE_UNIQUE_LENGTH]; + memset(aad_file_unique, 0, AAD_FILE_UNIQUE_LENGTH); + RAND_bytes(aad_file_unique, sizeof(AAD_FILE_UNIQUE_LENGTH)); + std::string aad_file_unique_str(reinterpret_cast(aad_file_unique), + AAD_FILE_UNIQUE_LENGTH) ; - } + bool supply_aad_prefix = false; + if (aad_prefix.empty()) + file_AAD_ = aad_file_unique_str; + else { + file_AAD_ = aad_prefix + aad_file_unique_str; + if (!store_aad_prefix_in_file) supply_aad_prefix = true; + } + algorithm_.algorithm = cipher; + algorithm_.aad.aad_file_unique = aad_file_unique_str; + algorithm_.aad.supply_aad_prefix = supply_aad_prefix; + if (!aad_prefix.empty() && store_aad_prefix_in_file) { + algorithm_.aad.aad_prefix = aad_prefix; + } + } }; class PARQUET_EXPORT WriterProperties { @@ -914,7 +954,6 @@ class PARQUET_EXPORT WriterProperties { return NULLPTR; } else { std::string footer_key = parquet_file_encryption_->getFooterEncryptionKey (); - //TODO: Fix AAD calculation if (footer_key.empty()) footer_key = parquet_file_encryption_->getFooterSigningKey (); return std::make_shared(parquet_file_encryption_->getAlgorithm().algorithm, @@ -982,7 +1021,6 @@ class PARQUET_EXPORT WriterProperties { if (column_prop == NULLPTR) return NULLPTR; if (column_prop->isEncryptedWithFooterKey()) { - //TODO: Fix AAD calculation if (parquet_file_encryption_->encryptedFooter ()) { return std::make_shared(parquet_file_encryption_->getAlgorithm().algorithm, parquet_file_encryption_->getFooterEncryptionKey(), diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 26cf7ea0bda3..4189d2c90791 100644 --- 
a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -164,17 +164,17 @@ static inline format::Statistics ToThrift(const EncodedStatistics& stats) { static inline format::AesGcmV1 ToAesGcmV1Thrift(AadMetadata aad) { format::AesGcmV1 aesGcmV1; - aesGcmV1.aad_prefix = aad.aad_prefix; - aesGcmV1.aad_file_unique = aad.aad_file_unique; - aesGcmV1.supply_aad_prefix = aad.supply_aad_prefix; + aesGcmV1.__set_aad_prefix(aad.aad_prefix); + aesGcmV1.__set_aad_file_unique(aad.aad_file_unique); + aesGcmV1.__set_supply_aad_prefix(aad.supply_aad_prefix); return aesGcmV1; } static inline format::AesGcmCtrV1 ToAesGcmCtrV1Thrift(AadMetadata aad) { format::AesGcmCtrV1 aesGcmCtrV1; - aesGcmCtrV1.aad_prefix = aad.aad_prefix; - aesGcmCtrV1.aad_file_unique = aad.aad_file_unique; - aesGcmCtrV1.supply_aad_prefix = aad.supply_aad_prefix; + aesGcmCtrV1.__set_aad_prefix(aad.aad_prefix); + aesGcmCtrV1.__set_aad_file_unique(aad.aad_file_unique); + aesGcmCtrV1.__set_supply_aad_prefix(aad.supply_aad_prefix); return aesGcmCtrV1; } @@ -237,9 +237,10 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali if (decrypted_buffer_len <= 0) { throw ParquetException("Couldn't decrypt buffer\n"); } + *len = encryption->CalculateCipherSize(decrypted_buffer_len, true); DeserializeThriftMsg(decrypted_buffer.data(), &decrypted_buffer_len, deserialized_msg); - *len = encryption->CalculateCipherSize(decrypted_buffer_len, true); + } } diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index a9b857e37fea..81744e46d39a 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -491,8 +491,8 @@ class PARQUET_EXPORT EncryptionProperties { public: EncryptionProperties() = default; EncryptionProperties(ParquetCipher::type algorithm, const std::string& key, - const std::string& aad = "") - : algorithm_(algorithm), key_(key), aad_(aad) {} + const std::string& file_aad, const std::string& aad = "") + : algorithm_(algorithm), key_(key), file_aad_(file_aad), 
aad_(aad) {} ~EncryptionProperties() { key_.replace(0, key_.length(), key_.length(), '\0'); } @@ -507,6 +507,7 @@ class PARQUET_EXPORT EncryptionProperties { const std::string& key() const { return key_; } const std::string& aad() const { return aad_; } + const std::string& fileAAD() const { return file_aad_; } uint32_t CalculateCipherSize(uint32_t plain_len, bool is_metadata = false) const { if (is_metadata || algorithm_ == ParquetCipher::AES_GCM_V1) { @@ -529,6 +530,7 @@ class PARQUET_EXPORT EncryptionProperties { private: ParquetCipher::type algorithm_; // encryption algorithm std::string key_; // encryption key, should have 16, 24, 32-byte length + std::string file_aad_; std::string aad_; // encryption additional authenticated data }; From 053c2a6069cfb4eabf311f14e535508962c49f41 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Tue, 16 Apr 2019 11:31:00 +0300 Subject: [PATCH 025/201] Fix parquet tests to work with the changes required to support AAD --- cpp/src/parquet/column_writer_test.cc | 2 +- cpp/src/parquet/statistics_test.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index 2d4711a64079..58c538b9939b 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -246,7 +246,7 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { // complete (no changes to the metadata buffer can be made after instantiation) ApplicationVersion app_version(this->writer_properties_->created_by()); auto metadata_accessor = - ColumnChunkMetaData::Make(metadata_->contents(), this->descr_, &app_version); + ColumnChunkMetaData::Make(metadata_->contents(), this->descr_, -1, -1, &app_version); return metadata_accessor->is_stats_set(); } diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc index bc1953a92f55..0022f16e1db3 100644 --- a/cpp/src/parquet/statistics_test.cc +++ b/cpp/src/parquet/statistics_test.cc 
@@ -517,7 +517,7 @@ void AssertStatsSet(const ApplicationVersion& version, const ColumnDescriptor* column, bool expected_is_set) { auto metadata_builder = ColumnChunkMetaDataBuilder::Make(props, column); auto column_chunk = - ColumnChunkMetaData::Make(metadata_builder->contents(), column, &version); + ColumnChunkMetaData::Make(metadata_builder->contents(), column, -1, -1, &version); EncodedStatistics stats; stats.set_is_signed(false); metadata_builder->SetStatistics(stats); From 613056eff25e2a20ff1b5be83fde7d3d07955ea0 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Fri, 19 Apr 2019 17:05:47 +0700 Subject: [PATCH 026/201] verify plaintext footer depends on config of decryption properties --- cpp/src/parquet/file_reader.cc | 78 ++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 37 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 36905839b18f..ab38d955abfd 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -328,37 +328,11 @@ class SerializedFile : public ParquetFileReader::Contents { file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &read_metadata_len); if (file_metadata_->is_plaintext_mode()) { - if (metadata_len - read_metadata_len != 28) { - throw ParquetException("Invalid parquet file. Cannot verify plaintext" - "mode footer."); - } - // get footer key - std::string footer_key_metadata = file_metadata_->footer_signing_key_metadata(); auto file_decryption = properties_.file_decryption(); if (file_decryption == nullptr) { - throw ParquetException("No decryption properties are provided. 
" - "Could not verify plaintext footer metadata"); - } - std::string footer_key = file_decryption->getFooterKey(); - // ignore footer key metadata if footer key is explicitly set via API - if (footer_key.empty()) { - if (footer_key_metadata.empty()) throw ParquetException("No footer key or " - "key metadata"); - if (file_decryption->getKeyRetriever() == nullptr) - throw ParquetException("No footer key or key retriever"); - try { - footer_key = file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); - } catch (KeyAccessDeniedException &e) { - std::stringstream ss; - ss << e.what(); - ss << "Footer key: access denied"; - throw ParquetException(ss.str()); - } - } - if (footer_key.empty()) { - throw ParquetException("Footer key unavailable. Could not verify plaintext " - "footer metadata"); + throw ParquetException("No decryption properties are provided"); } + EncryptionAlgorithm algo = file_metadata_->encryption_algorithm(); bool supply_aad_prefix = algo.aad.supply_aad_prefix; std::string aad_file_unique = algo.aad.aad_file_unique; @@ -386,15 +360,45 @@ class SerializedFile : public ParquetFileReader::Contents { else fileAAD = file_decryption->getAADPrefix() + aad_file_unique; - properties_.set_fileAAD(fileAAD); - std::string aad = parquet_encryption::createFooterAAD(fileAAD); - auto encryption = std::make_shared( - file_metadata_->encryption_algorithm().algorithm, - footer_key, fileAAD, aad); - if (! file_metadata_->verify(encryption, metadata_buffer->data() - + read_metadata_len, 28)) { - throw ParquetException("Invalid parquet file. Could not verify plaintext" - " footer metadata"); + properties_.set_fileAAD(fileAAD); + if (file_decryption->checkFooterIntegrity()) { + if (metadata_len - read_metadata_len != 28) { + throw ParquetException("Invalid parquet file. 
Cannot verify plaintext" + "mode footer."); + } + + // get footer key + std::string footer_key_metadata = file_metadata_->footer_signing_key_metadata(); + std::string footer_key = file_decryption->getFooterKey(); + // ignore footer key metadata if footer key is explicitly set via API + if (footer_key.empty()) { + if (footer_key_metadata.empty()) throw ParquetException("No footer key or " + "key metadata"); + if (file_decryption->getKeyRetriever() == nullptr) + throw ParquetException("No footer key or key retriever"); + try { + footer_key = file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); + } catch (KeyAccessDeniedException &e) { + std::stringstream ss; + ss << e.what(); + ss << "Footer key: access denied"; + throw ParquetException(ss.str()); + } + } + if (footer_key.empty()) { + throw ParquetException("Footer key unavailable. Could not verify plaintext " + "footer metadata"); + } + + std::string aad = parquet_encryption::createFooterAAD(fileAAD); + auto encryption = std::make_shared( + file_metadata_->encryption_algorithm().algorithm, + footer_key, fileAAD, aad); + if (! file_metadata_->verify(encryption, metadata_buffer->data() + + read_metadata_len, 28)) { + throw ParquetException("Invalid parquet file. 
Could not verify plaintext" + " footer metadata"); + } } } } From a195bd5a13d9e7fb64f4bd4039ab470dc9989359 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Mon, 22 Apr 2019 07:54:49 +0300 Subject: [PATCH 027/201] Fix code style --- cpp/src/parquet/column_reader.cc | 71 +++---- cpp/src/parquet/column_writer.cc | 69 ++++--- cpp/src/parquet/encryption.h | 2 +- cpp/src/parquet/file_reader.cc | 215 ++++++++++---------- cpp/src/parquet/file_writer.cc | 67 ++++--- cpp/src/parquet/metadata.cc | 222 +++++++++++---------- cpp/src/parquet/metadata.h | 20 +- cpp/src/parquet/properties.h | 330 +++++++++++++++---------------- cpp/src/parquet/schema.h | 2 +- cpp/src/parquet/types.h | 15 +- 10 files changed, 515 insertions(+), 498 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 5bef698051a3..ccaa215c8b71 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -102,14 +102,14 @@ int LevelDecoder::Decode(int batch_size, int16_t* levels) { ReaderProperties default_reader_properties() { static ReaderProperties default_reader_properties; - //reset column_map and fileAAD as default_reader_properties is static but - //can be used when reading parquet file with different reading options. + // reset column_map and fileAAD as default_reader_properties is static but + // can be used when reading parquet file with different reading options. 
if (default_reader_properties.column_map() != NULLPTR && default_reader_properties.column_map()->size () != 0) default_reader_properties.column_map()->clear(); - if (!default_reader_properties.fileAAD().empty()) + if (!default_reader_properties.fileAAD().empty()) default_reader_properties.set_fileAAD (""); - + return default_reader_properties; } @@ -144,14 +144,16 @@ class SerializedPageReader : public PageReader { if (encryption != NULLPTR) { DCHECK (!encryption_->fileAAD().empty()); //prepare the AAD for quick update later - data_pageAAD_ = parquet_encryption::createModuleAAD(encryption_->fileAAD(), - parquet_encryption::DataPage, - row_group_ordinal_, - column_ordinal_, (int16_t)-1); - data_page_headerAAD_ = parquet_encryption::createModuleAAD(encryption_->fileAAD(), - parquet_encryption::DataPageHeader, - row_group_ordinal_, - column_ordinal_, (int16_t)-1); + data_pageAAD_ = parquet_encryption::createModuleAAD( + encryption_->fileAAD(), + parquet_encryption::DataPage, + row_group_ordinal_, + column_ordinal_, (int16_t)-1); + data_page_headerAAD_ = parquet_encryption::createModuleAAD( + encryption_->fileAAD(), + parquet_encryption::DataPageHeader, + row_group_ordinal_, + column_ordinal_, (int16_t)-1); } } @@ -200,7 +202,7 @@ std::shared_ptr SerializedPageReader::NextPage() { if (first_page_) { current_page_is_dictionary = true; first_page_ = false; - } else + } else page_ordinal_++; } else page_ordinal_++; @@ -224,20 +226,20 @@ std::shared_ptr SerializedPageReader::NextPage() { // This gets used, then set by DeserializeThriftMsg header_size = static_cast(buffer.size()); try { - if (encryption_!= NULLPTR) { - if (current_page_is_dictionary) { - aad = parquet_encryption::createModuleAAD(encryption_->fileAAD(), - parquet_encryption::DictionaryPageHeader, - row_group_ordinal_, - column_ordinal_, (int16_t)-1); - encryption_->aad(aad); - } else { - parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, page_ordinal_); - encryption_->aad(data_page_headerAAD_); - } 
- } - DeserializeThriftMsg(reinterpret_cast(buffer.data()), - &header_size, ¤t_page_header_, encryption_); + if (encryption_!= NULLPTR) { + if (current_page_is_dictionary) { + aad = parquet_encryption::createModuleAAD(encryption_->fileAAD(), + parquet_encryption::DictionaryPageHeader, + row_group_ordinal_, + column_ordinal_, (int16_t)-1); + encryption_->aad(aad); + } else { + parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, page_ordinal_); + encryption_->aad(data_page_headerAAD_); + } + } + DeserializeThriftMsg(reinterpret_cast(buffer.data()), + &header_size, ¤t_page_header_, encryption_); break; } catch (std::exception& e) { // Failed to deserialize. Double the allowed page header size and try again @@ -258,17 +260,18 @@ std::shared_ptr SerializedPageReader::NextPage() { if (encryption_!= NULLPTR){ DCHECK(!encryption_->fileAAD().empty()); if (current_page_is_dictionary){ - aad = parquet_encryption::createModuleAAD(encryption_->fileAAD(), - parquet_encryption::DictionaryPage, - row_group_ordinal_, - column_ordinal_, (int16_t)-1); - encryption_->aad(aad); + aad = parquet_encryption::createModuleAAD( + encryption_->fileAAD(), + parquet_encryption::DictionaryPage, + row_group_ordinal_, + column_ordinal_, (int16_t)-1); + encryption_->aad(aad); } else { - parquet_encryption::quickUpdatePageAAD(data_pageAAD_, page_ordinal_); + parquet_encryption::quickUpdatePageAAD(data_pageAAD_, page_ordinal_); encryption_->aad(data_pageAAD_); } } - + // Read the compressed data page. 
std::shared_ptr page_buffer; PARQUET_THROW_NOT_OK(stream_->Read(compressed_len, &page_buffer)); diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 2c7f58ce37ed..6053e56d376f 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -156,14 +156,16 @@ class SerializedPageWriter : public PageWriter { encryption_(encryption){ if (encryption != NULLPTR) { //prepare the add for quick update later - data_pageAAD_ = parquet_encryption::createModuleAAD(encryption_->fileAAD(), - parquet_encryption::DataPage, - row_group_ordinal_, - column_ordinal_, (int16_t)-1); - data_page_headerAAD_ = parquet_encryption::createModuleAAD(encryption_->fileAAD(), - parquet_encryption::DataPageHeader, - row_group_ordinal_, - column_ordinal_, (int16_t)-1); + data_pageAAD_ = parquet_encryption::createModuleAAD( + encryption_->fileAAD(), + parquet_encryption::DataPage, + row_group_ordinal_, + column_ordinal_, (int16_t)-1); + data_page_headerAAD_ = parquet_encryption::createModuleAAD( + encryption_->fileAAD(), + parquet_encryption::DataPageHeader, + row_group_ordinal_, + column_ordinal_, (int16_t)-1); } compressor_ = GetCodecFromArrow(codec); thrift_serializer_.reset(new ThriftSerializer); @@ -191,10 +193,11 @@ class SerializedPageWriter : public PageWriter { std::shared_ptr encrypted_data_buffer = nullptr; if (encryption_.get()) { - encryption_->aad(parquet_encryption::createModuleAAD(encryption_->fileAAD(), - parquet_encryption::DictionaryPage, - row_group_ordinal_, - column_ordinal_, (int16_t)-1)); + encryption_->aad( + parquet_encryption::createModuleAAD(encryption_->fileAAD(), + parquet_encryption::DictionaryPage, + row_group_ordinal_, + column_ordinal_, (int16_t)-1)); encrypted_data_buffer = std::static_pointer_cast( AllocateBuffer(pool_, encryption_->CalculateCipherSize(output_data_len))); @@ -202,10 +205,11 @@ class SerializedPageWriter : public PageWriter { encryption_, false, compressed_data->data(), output_data_len, 
encrypted_data_buffer->mutable_data()); output_data_buffer = encrypted_data_buffer->data(); - encryption_->aad(parquet_encryption::createModuleAAD(encryption_->fileAAD(), - parquet_encryption::DictionaryPageHeader, - row_group_ordinal_, - column_ordinal_, (int16_t)-1)); + encryption_->aad( + parquet_encryption::createModuleAAD(encryption_->fileAAD(), + parquet_encryption::DictionaryPageHeader, + row_group_ordinal_, + column_ordinal_, (int16_t)-1)); } format::PageHeader page_header; @@ -238,10 +242,11 @@ class SerializedPageWriter : public PageWriter { fallback); auto props = metadata_->get_encryption_props(metadata_->descr()->path()); if (props != nullptr){ - props->aad(parquet_encryption::createModuleAAD(encryption_->fileAAD(), - parquet_encryption::ColumnMetaData, - row_group_ordinal_, - column_ordinal_, (int16_t)-1)); + props->aad( + parquet_encryption::createModuleAAD(encryption_->fileAAD(), + parquet_encryption::ColumnMetaData, + row_group_ordinal_, + column_ordinal_, (int16_t)-1)); } // Write metadata at end of column chunk metadata_->WriteTo(sink_.get(), props); @@ -292,7 +297,8 @@ class SerializedPageWriter : public PageWriter { encryption_, false, compressed_data->data(), output_data_len, encrypted_data_buffer->mutable_data()); output_data_buffer = encrypted_data_buffer->data(); - parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, page_ordinal_); + parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, + page_ordinal_); encryption_->aad(data_page_headerAAD_); } @@ -365,11 +371,12 @@ class BufferedPageWriter : public PageWriter { ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, int16_t current_column_ordinal, MemoryPool* pool = arrow::default_memory_pool()) - : final_sink_(sink), metadata_(metadata) { - in_memory_sink_ = CreateOutputStream(pool); - pager_ = std::unique_ptr( - new SerializedPageWriter(in_memory_sink_, codec, encryption, metadata, - row_group_ordinal, current_column_ordinal, pool)); + : final_sink_(sink), + 
metadata_(metadata) { + in_memory_sink_ = CreateOutputStream(pool); + pager_ = std::unique_ptr( + new SerializedPageWriter(in_memory_sink_, codec, encryption, metadata, + row_group_ordinal, current_column_ordinal, pool)); } // TODO: nullptr for EncryptionProperties int64_t WriteDictionaryPage(const DictionaryPage& page) override { @@ -418,14 +425,14 @@ std::unique_ptr PageWriter::Open( int16_t column_chunk_ordinal, MemoryPool* pool, bool buffered_row_group) { if (buffered_row_group) { return std::unique_ptr( - new BufferedPageWriter(sink, codec, encryption, metadata, - row_group_ordinal, column_chunk_ordinal, - pool)); + new BufferedPageWriter(sink, codec, encryption, metadata, + row_group_ordinal, column_chunk_ordinal, + pool)); } else { return std::unique_ptr( new SerializedPageWriter(sink, codec, encryption, metadata, - row_group_ordinal, column_chunk_ordinal, - pool)); + row_group_ordinal, column_chunk_ordinal, + pool)); } } diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 2147900b1c52..3a4481bd4aad 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -69,7 +69,7 @@ class PARQUET_EXPORT UnsupportedOperationException : public ParquetException { UnsupportedOperationException(const std::string& columnPath) : ParquetException(columnPath.c_str()) {} }; - + } // namespace parquet #endif // PARQUET_ENCRYPTION_H diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index ab38d955abfd..7914569eaf13 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -108,13 +108,14 @@ class SerializedRowGroup : public RowGroupReader::Contents { algorithm = file_metadata_->encryption_algorithm(); } std::shared_ptr, - std::string, parquet::schema::ColumnPath::CmpColumnPath>> + std::string, parquet::schema::ColumnPath::CmpColumnPath>> column_map = properties_.column_map(); // Read column chunk from the file auto col = row_group_metadata_->ColumnChunk(i, row_group_ordinal_, 
properties_.file_decryption(), - &algorithm, properties_.fileAAD(), - column_map); + &algorithm, + properties_.fileAAD(), + column_map); int64_t col_start = col->data_page_offset(); if (col->has_dictionary_page() && col->dictionary_page_offset() > 0 && col_start > col->dictionary_page_offset()) { @@ -156,10 +157,11 @@ class SerializedRowGroup : public RowGroupReader::Contents { } // the column is encrypted - std::string aad = parquet_encryption::createModuleAAD(properties_.fileAAD(), - parquet_encryption::ColumnMetaData, - row_group_ordinal_, - (int16_t)i, (int16_t)-1); + std::string aad = parquet_encryption::createModuleAAD( + properties_.fileAAD(), + parquet_encryption::ColumnMetaData, + row_group_ordinal_, + (int16_t)i, (int16_t)-1); auto file_decryption = properties_.file_decryption(); @@ -172,22 +174,21 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::string footer_key = file_decryption->getFooterKey(); // ignore footer key metadata if footer key is explicitly set via API if (footer_key.empty()) { - if (footer_key_metadata.empty()) - throw ParquetException("No footer key or key metadata"); - - if (file_decryption->getKeyRetriever() == nullptr) - throw ParquetException("No footer key or key retriever"); - footer_key = file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); + if (footer_key_metadata.empty()) + throw ParquetException("No footer key or key metadata"); + + if (file_decryption->getKeyRetriever() == nullptr) + throw ParquetException("No footer key or key retriever"); + footer_key = file_decryption->getKeyRetriever()->GetKey( + footer_key_metadata); } if (footer_key.empty()) { throw ParquetException("column is encrypted with null footer key"); } - ParquetCipher::type algorithm = file_metadata_->is_plaintext_mode() ? 
file_metadata_->encryption_algorithm().algorithm : file_crypto_metadata_->encryption_algorithm().algorithm; - auto footer_encryption = std::make_shared( algorithm, footer_key, properties_.fileAAD(), aad); @@ -196,7 +197,8 @@ class SerializedRowGroup : public RowGroupReader::Contents { footer_encryption, properties_.memory_pool()); } - // file is non-uniform encrypted and the column is encrypted with its own key + // file is non-uniform encrypted and the column + // is encrypted with its own key std::string column_key_metadata = crypto_metadata->key_metadata(); std::shared_ptr column_path = @@ -204,28 +206,29 @@ class SerializedRowGroup : public RowGroupReader::Contents { // encrypted with column key std::string column_key; // first look if we already got the key from before - if (column_map != NULLPTR && column_map->find(column_path) != column_map->end()) { + if (column_map != NULLPTR + && column_map->find(column_path) != column_map->end()) { column_key = column_map->at(column_path); - } - else { + } else { column_key = file_decryption->getColumnKey(column_path); // No explicit column key given via API. Retrieve via key metadata. 
if (column_key.empty() && !column_key_metadata.empty() && - file_decryption->getKeyRetriever() != nullptr){ - try { - column_key = file_decryption->getKeyRetriever()->GetKey(column_key_metadata); - } catch (KeyAccessDeniedException &e) { - std::stringstream ss; - ss << e.what(); - ss << " HiddenColumnException, path=" + column_path->ToDotString(); - throw HiddenColumnException(ss.str()); - } - } + file_decryption->getKeyRetriever() != nullptr){ + try { + column_key = file_decryption->getKeyRetriever()->GetKey( + column_key_metadata); + } catch (KeyAccessDeniedException &e) { + std::stringstream ss; + ss << "HiddenColumnException, path=" + + column_path->ToDotString() + " " + << e.what() << "\n"; + throw HiddenColumnException(ss.str()); + } + } } - if (column_key.empty()) { throw HiddenColumnException("column is encrypted with null key, path=" + - column_path->ToDotString()); + column_path->ToDotString()); } auto column_encryption = std::make_shared( file_crypto_metadata_->encryption_algorithm().algorithm, @@ -233,7 +236,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { properties_.fileAAD(), aad); return PageReader::Open(stream, col->num_values(), col->compression(), - col->has_dictionary_page(), row_group_ordinal_, (int16_t)i, + col->has_dictionary_page(), row_group_ordinal_, (int16_t)i, column_encryption, properties_.memory_pool()); } @@ -333,33 +336,34 @@ class SerializedFile : public ParquetFileReader::Contents { throw ParquetException("No decryption properties are provided"); } - EncryptionAlgorithm algo = file_metadata_->encryption_algorithm(); - bool supply_aad_prefix = algo.aad.supply_aad_prefix; - std::string aad_file_unique = algo.aad.aad_file_unique; - std::string aad_prefix = algo.aad.aad_prefix; - if (algo.algorithm != ParquetCipher::AES_GCM_CTR_V1 - && algo.algorithm != ParquetCipher::AES_GCM_V1) - throw ParquetException("Unsupported algorithm"); - if (!file_decryption->getAADPrefix().empty()) { - if 
(file_decryption->getAADPrefix().compare(aad_prefix) != 0) { - throw ParquetException("ADD Prefix in file and in properties is not the same"); - } - std::shared_ptr aad_prefix_verifier = - file_decryption->getAADPrefixVerifier(); - if (aad_prefix_verifier != NULLPTR) { - aad_prefix_verifier->check(aad_prefix); + EncryptionAlgorithm algo = file_metadata_->encryption_algorithm(); + bool supply_aad_prefix = algo.aad.supply_aad_prefix; + std::string aad_file_unique = algo.aad.aad_file_unique; + std::string aad_prefix = algo.aad.aad_prefix; + if (algo.algorithm != ParquetCipher::AES_GCM_CTR_V1 + && algo.algorithm != ParquetCipher::AES_GCM_V1) + throw ParquetException("Unsupported algorithm"); + if (!file_decryption->getAADPrefix().empty()) { + if (file_decryption->getAADPrefix().compare(aad_prefix) != 0) { + throw ParquetException("ADD Prefix in file and " + "in properties is not the same"); } - } - if (supply_aad_prefix && file_decryption->getAADPrefix().empty()) { - throw ParquetException("AAD prefix used for file encryption, but not stored in " - "file and not supplied in decryption properties"); - } - std::string fileAAD; - if (!supply_aad_prefix) - fileAAD = aad_prefix + aad_file_unique; - else - fileAAD = file_decryption->getAADPrefix() + aad_file_unique; - + std::shared_ptr aad_prefix_verifier = + file_decryption->getAADPrefixVerifier(); + if (aad_prefix_verifier != NULLPTR) + aad_prefix_verifier->check(aad_prefix); + } + if (supply_aad_prefix && file_decryption->getAADPrefix().empty()) { + throw ParquetException("AAD prefix used for file encryption, " + "but not stored in file and not supplied " + "in decryption properties"); + } + std::string fileAAD; + if (!supply_aad_prefix) + fileAAD = aad_prefix + aad_file_unique; + else + fileAAD = file_decryption->getAADPrefix() + aad_file_unique; + properties_.set_fileAAD(fileAAD); if (file_decryption->checkFooterIntegrity()) { if (metadata_len - read_metadata_len != 28) { @@ -372,32 +376,31 @@ class SerializedFile : 
public ParquetFileReader::Contents { std::string footer_key = file_decryption->getFooterKey(); // ignore footer key metadata if footer key is explicitly set via API if (footer_key.empty()) { - if (footer_key_metadata.empty()) throw ParquetException("No footer key or " - "key metadata"); + if (footer_key_metadata.empty()) + throw ParquetException("No footer key or key metadata"); if (file_decryption->getKeyRetriever() == nullptr) throw ParquetException("No footer key or key retriever"); try { - footer_key = file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); + footer_key = + file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); } catch (KeyAccessDeniedException &e) { std::stringstream ss; - ss << e.what(); - ss << "Footer key: access denied"; + ss << "Footer key: access denied " << e.what() << "\n"; throw ParquetException(ss.str()); } } if (footer_key.empty()) { - throw ParquetException("Footer key unavailable. Could not verify plaintext " - "footer metadata"); + throw ParquetException("Footer key unavailable. Could not verify " + "plaintext footer metadata"); } - std::string aad = parquet_encryption::createFooterAAD(fileAAD); auto encryption = std::make_shared( - file_metadata_->encryption_algorithm().algorithm, - footer_key, fileAAD, aad); + file_metadata_->encryption_algorithm().algorithm, + footer_key, fileAAD, aad); if (! file_metadata_->verify(encryption, metadata_buffer->data() + read_metadata_len, 28)) { throw ParquetException("Invalid parquet file. Could not verify plaintext" - " footer metadata"); + " footer metadata"); } } } @@ -430,40 +433,40 @@ class SerializedFile : public ParquetFileReader::Contents { } auto file_decryption = properties_.file_decryption(); if (file_decryption == nullptr) { - throw ParquetException("No decryption properties are provided. Could not read " - "encrypted footer metadata"); + throw ParquetException("No decryption properties are provided. 
Could not read " + "encrypted footer metadata"); } - uint32_t crypto_metadata_len = footer_len; file_crypto_metadata_ = - FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); + FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); EncryptionAlgorithm algo = file_crypto_metadata_->encryption_algorithm(); bool supply_aad_prefix = algo.aad.supply_aad_prefix; std::string aad_file_unique = algo.aad.aad_file_unique; std::string aad_prefix = algo.aad.aad_prefix; if (algo.algorithm != ParquetCipher::AES_GCM_CTR_V1 && algo.algorithm != ParquetCipher::AES_GCM_V1) - throw ParquetException("Unsupported algorithm"); + throw ParquetException("Unsupported algorithm"); if (!file_decryption->getAADPrefix().empty()) { - if (file_decryption->getAADPrefix().compare(aad_prefix) != 0) { - throw ParquetException("ADD Prefix in file and in properties is not the same"); - } - std::shared_ptr aad_prefix_verifier = - file_decryption->getAADPrefixVerifier(); - if (aad_prefix_verifier != NULLPTR) { - aad_prefix_verifier->check(aad_prefix); - } + if (file_decryption->getAADPrefix().compare(aad_prefix) != 0) { + throw ParquetException("ADD Prefix in file and in properties " + "is not the same"); + } + std::shared_ptr aad_prefix_verifier = + file_decryption->getAADPrefixVerifier(); + if (aad_prefix_verifier != NULLPTR) + aad_prefix_verifier->check(aad_prefix); } if (supply_aad_prefix && file_decryption->getAADPrefix().empty()) { - throw ParquetException("AAD prefix used for file encryption, but not stored in file " - "and not supplied in decryption properties"); + throw ParquetException("AAD prefix used for file encryption, " + "but not stored in file and not supplied " + "in decryption properties"); } std::string fileAAD; if (!supply_aad_prefix) fileAAD = aad_prefix + aad_file_unique; else fileAAD = file_decryption->getAADPrefix() + aad_file_unique; - //save fileAAD for later use + // save fileAAD for later use 
properties_.set_fileAAD(fileAAD); std::string aad = parquet_encryption::createFooterAAD(fileAAD); @@ -474,37 +477,37 @@ class SerializedFile : public ParquetFileReader::Contents { source_->ReadAt(metadata_offset, metadata_len, &metadata_buffer)); if (metadata_buffer->size() != metadata_len) { throw ParquetException("Invalid encrypted parquet file. " - "Could not read footer metadata bytes."); + "Could not read footer metadata bytes."); } - + // get footer key metadata std::string footer_key_metadata = file_crypto_metadata_->key_metadata(); std::string footer_key = file_decryption->getFooterKey(); if (footer_key.empty()) { - if (footer_key_metadata.empty()) throw ParquetException("No footer key or " - "key metadata"); - if (file_decryption->getKeyRetriever() == nullptr) - throw ParquetException("No footer key or key retriever"); - try { - footer_key = file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); - } catch (KeyAccessDeniedException &e) { - std::stringstream ss; - ss << e.what(); - ss << "Footer key: access denied"; - throw ParquetException(ss.str()); - } + if (footer_key_metadata.empty()) + throw ParquetException("No footer key or key metadata"); + if (file_decryption->getKeyRetriever() == nullptr) + throw ParquetException("No footer key or key retriever"); + try { + footer_key = + file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); + } catch (KeyAccessDeniedException &e) { + std::stringstream ss; + ss << "Footer key: access denied " << e.what() << "\n";; + throw ParquetException(ss.str()); + } } - if (footer_key.empty()) { - throw ParquetException("Invalid footer encryption key. " - "Could not parse footer metadata"); + throw ParquetException("Invalid footer encryption key. 
" + "Could not parse footer metadata"); } auto footer_encryption = std::make_shared( file_crypto_metadata_->encryption_algorithm().algorithm, - footer_key, - fileAAD, aad); - file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &metadata_len, - footer_encryption); + footer_key, + fileAAD, aad); + file_metadata_ = FileMetaData::Make(metadata_buffer->data(), + &metadata_len, + footer_encryption); } } diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index c60555cf54b6..815518d229ce 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -85,8 +85,8 @@ class RowGroupSerializer : public RowGroupWriter::Contents { RowGroupSerializer(const std::shared_ptr& sink, RowGroupMetaDataBuilder* metadata, int16_t row_group_ordinal, - const WriterProperties* properties, bool buffered_row_group = false) - + const WriterProperties* properties, + bool buffered_row_group = false) : sink_(sink), metadata_(metadata), properties_(properties), @@ -133,10 +133,10 @@ class RowGroupSerializer : public RowGroupWriter::Contents { const ColumnDescriptor* column_descr = col_meta->descr(); std::unique_ptr pager = PageWriter::Open(sink_, properties_->compression(column_descr->path()), - properties_->encryption(column_descr->path()), col_meta, - row_group_ordinal_, (int16_t)(current_column_index_-1), - properties_->memory_pool()); - + properties_->encryption(column_descr->path()), + col_meta, row_group_ordinal_, + (int16_t)(current_column_index_-1), + properties_->memory_pool()); column_writers_[0] = ColumnWriter::Make(col_meta, std::move(pager), properties_); return column_writers_[0].get(); } @@ -233,12 +233,11 @@ class RowGroupSerializer : public RowGroupWriter::Contents { auto col_meta = metadata_->NextColumnChunk(); const ColumnDescriptor* column_descr = col_meta->descr(); std::unique_ptr pager = - PageWriter::Open(sink_, properties_->compression(column_descr->path()), - properties_->encryption(column_descr->path()), col_meta, - 
(int16_t)row_group_ordinal_, - (int16_t)current_column_index_, - properties_->memory_pool(), buffered_row_group_); - + PageWriter::Open(sink_, properties_->compression(column_descr->path()), + properties_->encryption(column_descr->path()), + col_meta, (int16_t)row_group_ordinal_, + (int16_t)current_column_index_, + properties_->memory_pool(), buffered_row_group_); column_writers_.push_back( ColumnWriter::Make(col_meta, std::move(pager), properties_)); } @@ -281,9 +280,8 @@ class FileSerializer : public ParquetFileWriter::Contents { auto file_encryption = properties_->file_encryption(); if (file_encryption == nullptr) { file_metadata_ = metadata_->Finish(); - WriteFileMetaData(*file_metadata_, sink_.get()); - } - else { + WriteFileMetaData(*metadata, sink_.get()); + } else { if (file_encryption->encryptedFooter()) { // encrypted footer file_metadata_ = metadata_->Finish(); @@ -291,37 +289,38 @@ class FileSerializer : public ParquetFileWriter::Contents { uint64_t metadata_start = static_cast(sink_->Tell()); auto crypto_metadata = metadata_->GetCryptoMetaData(); WriteFileCryptoMetaData(*crypto_metadata, sink_.get()); - - ParquetCipher::type algorithm = file_encryption->getAlgorithm().algorithm; - std::string aad = parquet_encryption::createFooterAAD(file_encryption->getFileAAD()); - std::shared_ptr footer_encryption = - std::make_shared(algorithm, - file_encryption->getFooterEncryptionKey(), - file_encryption->getFileAAD(), aad); + + ParquetCipher::type algorithm = + file_encryption->getAlgorithm().algorithm; + std::string aad = parquet_encryption::createFooterAAD( + file_encryption->getFileAAD()); + std::shared_ptr footer_encryption = std::make_shared( + algorithm, + file_encryption->getFooterEncryptionKey(), + file_encryption->getFileAAD(), aad); WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryption, true); uint32_t footer_and_crypto_len = static_cast(sink_->Tell() - metadata_start); sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4); 
sink_->Write(PARQUET_EMAGIC, 4); - } - else { + } else { // footer plain mode EncryptionAlgorithm signing_encryption; EncryptionAlgorithm algo = file_encryption->getAlgorithm(); - signing_encryption.aad.aad_file_unique = algo.aad.aad_file_unique; signing_encryption.aad.supply_aad_prefix = algo.aad.supply_aad_prefix; if (!algo.aad.supply_aad_prefix) signing_encryption.aad.aad_prefix = algo.aad.aad_prefix; signing_encryption.algorithm = ParquetCipher::AES_GCM_V1; - - file_metadata_ = metadata_->Finish(&signing_encryption, - file_encryption->getFooterSigningKeyMetadata ()); - ParquetCipher::type algorithm = algo.algorithm; - std::string aad = parquet_encryption::createFooterAAD(file_encryption->getFileAAD()); - std::shared_ptr footer_encryption = - std::make_shared(algorithm, - file_encryption->getFooterSigningKey(), - file_encryption->getFileAAD(), aad); + file_metadata_ = metadata_->Finish( + &signing_encryption, + file_encryption->getFooterSigningKeyMetadata ()); + ParquetCipher::type algorithm = algo.algorithm; + std::string aad = parquet_encryption::createFooterAAD( + file_encryption->getFileAAD()); + std::shared_ptr footer_encryption = std::make_shared( + algorithm, + file_encryption->getFooterSigningKey(), + file_encryption->getFileAAD(), aad); WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryption, false); } } diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 7c0d2dc97ff3..cead71d5ac44 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -172,17 +172,18 @@ const std::string& ColumnCryptoMetaData::key_metadata() const { // ColumnChunk metadata class ColumnChunkMetaData::ColumnChunkMetaDataImpl { public: - explicit ColumnChunkMetaDataImpl(const format::ColumnChunk* column, - const ColumnDescriptor* descr, - int16_t row_group_ordinal, - int16_t column_ordinal, - const ApplicationVersion* writer_version, - FileDecryptionProperties* file_decryption = NULLPTR, - const EncryptionAlgorithm* algorithm = 
NULLPTR, - const std::string& fileAAD = "", - std::shared_ptr, std::string, - schema::ColumnPath::CmpColumnPath>> column_map = NULLPTR) - : column_(column), descr_(descr), writer_version_(writer_version) { + explicit ColumnChunkMetaDataImpl( + const format::ColumnChunk* column, + const ColumnDescriptor* descr, + int16_t row_group_ordinal, + int16_t column_ordinal, + const ApplicationVersion* writer_version, + FileDecryptionProperties* file_decryption = NULLPTR, + const EncryptionAlgorithm* algorithm = NULLPTR, + const std::string& fileAAD = "", + std::shared_ptr, std::string, + schema::ColumnPath::CmpColumnPath>> column_map = NULLPTR) + : column_(column), descr_(descr), writer_version_(writer_version) { metadata_ = column->meta_data; if (column->__isset.crypto_metadata) { @@ -194,51 +195,60 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { } // should decrypt metadata std::shared_ptr path = - std::make_shared( - ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema); + std::make_shared( + ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema); std::string key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata; - std::string key; - if (column_map != NULLPTR && (column_map->find(path) != column_map->end())) { + std::string key; + if (column_map != NULLPTR + && (column_map->find(path) != column_map->end())) { //First retrieve the key in column_map if exists - key = column_map->at(path); - } else { + key = column_map->at(path); + } else { key = file_decryption->getColumnKey(path); // No explicit column key given via API. Retrieve via key metadata. 
- if (key.empty() && !key_metadata.empty() && file_decryption->getKeyRetriever() != nullptr){ + if (key.empty() && !key_metadata.empty() + && file_decryption->getKeyRetriever() != nullptr){ try { - key = file_decryption->getKeyRetriever()->GetKey(key_metadata); + key = file_decryption->getKeyRetriever()->GetKey(key_metadata); } catch (KeyAccessDeniedException &e) { // Hidden column: encrypted, but key unavailable - std::stringstream ss; - ss << e.what(); - ss << " HiddenColumnException path=" + path->ToDotString(); - throw HiddenColumnException(ss.str()); - } + std::stringstream ss; + ss << "HiddenColumnException path=" + path->ToDotString() + " " + << e.what() << "\n"; + throw HiddenColumnException(ss.str()); + } if (key.empty ()) - throw HiddenColumnException("HiddenColumnException path=" + path->ToDotString()); + throw HiddenColumnException("HiddenColumnException path=" + + path->ToDotString()); + } + if (column_map != NULLPTR) { + // save column key for future use + (*column_map)[path]=key; } - if (column_map != NULLPTR) { - // save column key for future use - (*column_map)[path]=key; - } - } - if (key.empty()) { - // Hidden column: encrypted, but key unavailable - throw HiddenColumnException("HiddenColumnException path= " + path->ToDotString()); - } + } + if (key.empty()) { + // Hidden column: encrypted, but key unavailable + throw HiddenColumnException("HiddenColumnException path= " + + path->ToDotString()); + } DCHECK(algorithm != NULLPTR); - std::string aad = parquet_encryption::createModuleAAD(fileAAD, - parquet_encryption::ColumnMetaData, - row_group_ordinal, - column_ordinal, (int16_t)-1); - - auto encryption = std::make_shared(algorithm->algorithm, - key, fileAAD, aad); - - uint32_t len = static_cast(column->encrypted_column_metadata.size()); - DeserializeThriftMsg(reinterpret_cast(column->encrypted_column_metadata.c_str()), - &len, &metadata_, encryption, false); + std::string aad = parquet_encryption::createModuleAAD( + fileAAD, + 
parquet_encryption::ColumnMetaData, + row_group_ordinal, + column_ordinal, (int16_t)-1); + auto encryption = std::make_shared( + algorithm->algorithm, + key, fileAAD, aad); + uint32_t len = + static_cast(column->encrypted_column_metadata.size()); + DeserializeThriftMsg( + reinterpret_cast( + column->encrypted_column_metadata.c_str()), + &len, &metadata_, + encryption, + false); } } for (auto encoding : metadata_.encodings) { @@ -246,7 +256,6 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { } possible_stats_ = nullptr; } - // column chunk inline int64_t file_offset() const { return column_->file_offset; } inline const std::string& file_path() const { return column_->file_path; } @@ -343,31 +352,32 @@ std::unique_ptr ColumnChunkMetaData::Make( std::shared_ptr, std::string, schema::ColumnPath::CmpColumnPath>> column_map) { return std::unique_ptr( - new ColumnChunkMetaData(metadata, descr, - row_group_ordinal, column_ordinal, - writer_version, file_decryption, - algorithm, fileAAD, column_map)); + new ColumnChunkMetaData(metadata, descr, row_group_ordinal, + column_ordinal, writer_version, + file_decryption, algorithm, fileAAD, + column_map)); } - -ColumnChunkMetaData::ColumnChunkMetaData(const void* metadata, - const ColumnDescriptor* descr, - int16_t row_group_ordinal, int16_t column_ordinal, - const ApplicationVersion* writer_version, - FileDecryptionProperties* file_decryption, - const EncryptionAlgorithm* algorithm, - const std::string& fileAAD, - std::shared_ptr, - std::string, schema::ColumnPath::CmpColumnPath>> column_map) -: impl_{std::unique_ptr(new ColumnChunkMetaDataImpl( + +ColumnChunkMetaData::ColumnChunkMetaData( + const void* metadata, + const ColumnDescriptor* descr, + int16_t row_group_ordinal, + int16_t column_ordinal, + const ApplicationVersion* writer_version, + FileDecryptionProperties* file_decryption, + const EncryptionAlgorithm* algorithm, + const std::string& fileAAD, + std::shared_ptr, + std::string, schema::ColumnPath::CmpColumnPath>> 
column_map) + : impl_{std::unique_ptr(new ColumnChunkMetaDataImpl( reinterpret_cast(metadata), descr, row_group_ordinal, column_ordinal, writer_version, - file_decryption, algorithm, + file_decryption, algorithm, fileAAD, column_map))} {} ColumnChunkMetaData::~ColumnChunkMetaData() {} - // column chunk int64_t ColumnChunkMetaData::file_offset() const { return impl_->file_offset(); } @@ -446,22 +456,25 @@ class RowGroupMetaData::RowGroupMetaDataImpl { inline const SchemaDescriptor* schema() const { return schema_; } - std::unique_ptr ColumnChunk(int i, int16_t row_group_ordinal, - FileDecryptionProperties* file_decryption = NULLPTR, - const EncryptionAlgorithm* algorithm = NULLPTR, - const std::string& fileAAD = "", - std::shared_ptr, - std::string, - parquet::schema::ColumnPath::CmpColumnPath>> column_map = NULLPTR) { + std::unique_ptr ColumnChunk( + int i, int16_t row_group_ordinal, + FileDecryptionProperties* file_decryption = NULLPTR, + const EncryptionAlgorithm* algorithm = NULLPTR, + const std::string& fileAAD = "", + std::shared_ptr, + std::string, + parquet::schema::ColumnPath::CmpColumnPath>> column_map = NULLPTR) { if (!(i < num_columns())) { std::stringstream ss; ss << "The file only has " << num_columns() << " columns, requested metadata for column: " << i; throw ParquetException(ss.str()); } - return ColumnChunkMetaData::Make(&row_group_->columns[i], schema_->Column(i), - row_group_ordinal, (int16_t)i, - writer_version_, file_decryption, algorithm, fileAAD, column_map); + return ColumnChunkMetaData::Make( + &row_group_->columns[i], schema_->Column(i), + row_group_ordinal, (int16_t)i, + writer_version_, file_decryption, algorithm, fileAAD, + column_map); } private: @@ -492,16 +505,13 @@ int64_t RowGroupMetaData::total_byte_size() const { return impl_->total_byte_siz const SchemaDescriptor* RowGroupMetaData::schema() const { return impl_->schema(); } -std::unique_ptr RowGroupMetaData::ColumnChunk(int i, - int16_t row_group_ordinal, - 
FileDecryptionProperties* file_decryption, - const EncryptionAlgorithm* algorithm, - const std::string& fileAAD, - std::shared_ptr, - std::string, - schema::ColumnPath::CmpColumnPath>> column_map) const { +std::unique_ptr RowGroupMetaData::ColumnChunk( + int i, int16_t row_group_ordinal, FileDecryptionProperties* file_decryption, + const EncryptionAlgorithm* algorithm, const std::string& fileAAD, + std::shared_ptr, std::string, + schema::ColumnPath::CmpColumnPath>> column_map) const { return impl_->ColumnChunk(i, row_group_ordinal, file_decryption, algorithm, - fileAAD, column_map); + fileAAD, column_map); } // file metadata @@ -509,8 +519,9 @@ class FileMetaData::FileMetaDataImpl { public: FileMetaDataImpl() : metadata_len_(0) {} - explicit FileMetaDataImpl(const void* metadata, uint32_t* metadata_len, - const std::shared_ptr& encryption = nullptr) + explicit FileMetaDataImpl( + const void* metadata, uint32_t* metadata_len, + const std::shared_ptr& encryption = nullptr) : metadata_len_(0) { metadata_.reset(new format::FileMetaData); DeserializeThriftMsg(reinterpret_cast(metadata), metadata_len, @@ -543,11 +554,11 @@ class FileMetaData::FileMetaDataImpl { std::vector encrypted_buffer(encryption->CalculateCipherSize(serialized_len)); uint32_t encrypted_len = parquet_encryption::SignedFooterEncrypt(serialized_data, serialized_len, - encryption->key_bytes(), - encryption->key_length(), - encryption->aad_bytes(), - encryption->aad_length(), - nonce, 12, encrypted_buffer.data()); + encryption->key_bytes(), + encryption->key_length(), + encryption->aad_bytes(), + encryption->aad_length(), + nonce, 12, encrypted_buffer.data()); return 0 == memcmp(encrypted_buffer.data() + encrypted_len - 16, tag, 16); } @@ -572,7 +583,8 @@ class FileMetaData::FileMetaDataImpl { const ApplicationVersion& writer_version() const { return writer_version_; } - void WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryption) const { + void WriteTo(::arrow::io::OutputStream* dst, 
+ const std::shared_ptr& encryption) const { ThriftSerializer serializer; if (is_plaintext_mode()) { uint8_t* serialized_data; @@ -677,9 +689,10 @@ class FileMetaData::FileMetaDataImpl { std::shared_ptr key_value_metadata_; }; -std::shared_ptr FileMetaData::Make(const void* metadata, - uint32_t* metadata_len, - const std::shared_ptr& encryption) { +std::shared_ptr FileMetaData::Make( + const void* metadata, + uint32_t* metadata_len, + const std::shared_ptr& encryption) { // This FileMetaData ctor is private, not compatible with std::make_shared return std::shared_ptr( new FileMetaData(metadata, metadata_len, encryption)); @@ -756,12 +769,13 @@ void FileMetaData::AppendRowGroups(const FileMetaData& other) { } void FileMetaData::WriteTo(::arrow::io::OutputStream* dst, - const std::shared_ptr& encryption) const { + const std::shared_ptr& encryption) const { if (encryption != nullptr) - encryption->aad(parquet_encryption::createModuleAAD(encryption->fileAAD(), - parquet_encryption::Footer, - (int16_t)-1, (int16_t)-1, - (int16_t)-1)); + encryption->aad( + parquet_encryption::createModuleAAD(encryption->fileAAD(), + parquet_encryption::Footer, + (int16_t)-1, (int16_t)-1, + (int16_t)-1)); return impl_->WriteTo(dst, encryption); } @@ -989,7 +1003,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } void WriteTo(::arrow::io::OutputStream* sink, - const std::shared_ptr& encryption) { + const std::shared_ptr& encryption) { ThriftSerializer serializer; const auto& encrypt_md = properties_->column_encryption_props(column_->path()); @@ -1025,8 +1039,8 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { // Thrift-serialize the ColumnMetaData structure, // encrypt it with the column key, and write to encrypted_column_metadata auto encrypt_props = encryption; - uint8_t* serialized_data; - uint32_t serialized_len; + uint8_t* serialized_data; + uint32_t serialized_len; serializer.SerializeToBuffer(&column_metadata_, &serialized_len, 
&serialized_data); // encrypt the footer key @@ -1137,10 +1151,10 @@ void ColumnChunkMetaDataBuilder::Finish(int64_t num_values, } void ColumnChunkMetaDataBuilder::WriteTo(::arrow::io::OutputStream* sink, - const std::shared_ptr& encryption) { + const std::shared_ptr& encryption) { impl_->WriteTo(sink, encryption); } - + const ColumnDescriptor* ColumnChunkMetaDataBuilder::descr() const { return impl_->descr(); } @@ -1371,7 +1385,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { key_metadata = file_encryption->getFooterEncryptionKeyMetadata(); else key_metadata = file_encryption->getFooterSigningKeyMetadata(); - + if (!key_metadata.empty()) { crypto_metadata_->__set_key_metadata(key_metadata); } diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 84b84705ba5f..e80f90fd175c 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -23,7 +23,7 @@ #include #include #include - + #include "arrow/util/key_value_metadata.h" #include "parquet/platform.h" @@ -123,7 +123,7 @@ class PARQUET_EXPORT ColumnChunkMetaData { const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal = -1, int16_t column_ordinal = -1, const ApplicationVersion* writer_version = NULLPTR, - FileDecryptionProperties* file_decryption = NULLPTR, + FileDecryptionProperties* file_decryption = NULLPTR, const EncryptionAlgorithm* algorithm = NULLPTR, const std::string &fileAAD = "", std::shared_ptr, @@ -156,14 +156,14 @@ class PARQUET_EXPORT ColumnChunkMetaData { private: explicit ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, - int16_t row_group_ordinal, int16_t column_ordinal, + int16_t row_group_ordinal, int16_t column_ordinal, const ApplicationVersion* writer_version = NULLPTR, FileDecryptionProperties* file_decryption = NULLPTR, const EncryptionAlgorithm* algorithm = NULLPTR, const std::string& fileAAD = "", std::shared_ptr, - std::string, parquet::schema::ColumnPath::CmpColumnPath>> - column_map = NULLPTR); + 
std::string, parquet::schema::ColumnPath::CmpColumnPath>> + column_map = NULLPTR); // PIMPL Idiom class ColumnChunkMetaDataImpl; std::unique_ptr impl_; @@ -185,13 +185,13 @@ class PARQUET_EXPORT RowGroupMetaData { // Return const-pointer to make it clear that this object is not to be copied const SchemaDescriptor* schema() const; std::unique_ptr ColumnChunk(int i, int16_t row_group_ordinal = -1, - FileDecryptionProperties* file_decryption = NULLPTR, + FileDecryptionProperties* file_decryption = NULLPTR, const EncryptionAlgorithm* algorithm = NULLPTR, const std::string &fileAAD = "", std::shared_ptr, - std::string, - parquet::schema::ColumnPath::CmpColumnPath>> - column_map = NULLPTR) const; + std::string, + parquet::schema::ColumnPath::CmpColumnPath>> + column_map = NULLPTR) const; private: explicit RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema, @@ -311,7 +311,7 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { // For writing metadata at end of column chunk void WriteTo(::arrow::io::OutputStream* sink, - const std::shared_ptr& encryption = NULLPTR); + const std::shared_ptr& encryption = NULLPTR); std::shared_ptr get_encryption_props(const std::shared_ptr& path); diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index c306fd89ec8d..cf7542d49479 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -58,20 +58,20 @@ class PARQUET_EXPORT ColumnEncryptionProperties { // Convenience builder for encrypted columns. Builder(const std::shared_ptr& path) : Builder(path, true) {} - + // Set a column-specific key. // If key is not set on an encrypted column, the column will // be encrypted with the footer key. // keyBytes Key length must be either 16, 24 or 32 bytes. Builder* withKey(const std::string& key) { - if (key.empty ()) + if (key.empty ()) return this; - + DCHECK(!key.empty()); key_ = key; return this; } - + // Set a key retrieval metadata. 
// use either withKeyMetaData or withKeyID, not both Builder* withKeyMetaData(const std::string& key_metadata) { @@ -80,7 +80,7 @@ class PARQUET_EXPORT ColumnEncryptionProperties { key_metadata_ = key_metadata; return this; } - + // Set a key retrieval metadata (converted from String). // use either withKeyMetaData or withKeyID, not both // key_id will be converted to metadata (UTF-8 array). @@ -88,14 +88,14 @@ class PARQUET_EXPORT ColumnEncryptionProperties { //key_id is expected to be in UTF8 encoding ::arrow::util::InitializeUTF8(); if (!::arrow::util::ValidateUTF8(reinterpret_cast(key_id.c_str()), - key_id.size())) - throw ParquetException("key id should be in UTF8 encoding"); - + key_id.size())) + throw ParquetException("key id should be in UTF8 encoding"); + DCHECK(!key_id.empty()); this->withKeyMetaData(key_id); return this; } - + std::shared_ptr build() { return std::shared_ptr(new ColumnEncryptionProperties( @@ -104,35 +104,32 @@ class PARQUET_EXPORT ColumnEncryptionProperties { key_, key_metadata_)); } - - - private: + + private: const std::shared_ptr column_path_; bool encrypted_; std::string key_; std::string key_metadata_; - + Builder(const std::shared_ptr& path, bool encrypted) : column_path_(path), encrypted_(encrypted) {} }; - const std::shared_ptr& getPath() { return column_path_; } bool isEncrypted() const { return encrypted_; } bool isEncryptedWithFooterKey() const { return encrypted_with_footer_key_; } const std::string& getKey() const { return key_; } const std::string& getKeyMetaData() const { return key_metadata_; } - + ColumnEncryptionProperties() = default; ColumnEncryptionProperties(const ColumnEncryptionProperties& other) = default; ColumnEncryptionProperties(ColumnEncryptionProperties&& other) = default; - + private: const std::shared_ptr column_path_; bool encrypted_; bool encrypted_with_footer_key_; std::string key_; std::string key_metadata_; - explicit ColumnEncryptionProperties(bool encrypted, const std::shared_ptr& column_path, 
const std::string& key, @@ -140,20 +137,20 @@ class PARQUET_EXPORT ColumnEncryptionProperties { DCHECK(column_path != nullptr); if (!encrypted) DCHECK(key.empty() && key_metadata.empty()); - + if (!key.empty()) DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); - + encrypted_with_footer_key_ = (encrypted && key.empty()); if (encrypted_with_footer_key_) DCHECK(key_metadata.empty()); - + encrypted_ = encrypted; key_metadata_ = key_metadata; key_ = key; } }; - + class PARQUET_EXPORT ColumnDecryptionProperties { public: class Builder { @@ -162,58 +159,57 @@ class PARQUET_EXPORT ColumnDecryptionProperties { Builder(const std::string name) { Builder(schema::ColumnPath::FromDotString(name)); } - + Builder(const std::shared_ptr& path) : column_path_(path) {} - + // Set an explicit column key. If applied on a file that contains key metadata for this column - // the metadata will be ignored, the column will be decrypted with this key. - // key length must be either 16, 24 or 32 bytes. + // key length must be either 16, 24 or 32 bytes. 
Builder* withKey(const std::string& key) { - if (key.empty ()) + if (key.empty ()) return this; - + DCHECK(!key.empty()); key_ = key; return this; } - + std::shared_ptr build() { - return - std::shared_ptr(new ColumnDecryptionProperties(column_path_, - key_)); + return std::shared_ptr( + new ColumnDecryptionProperties(column_path_, key_)); } - + private: const std::shared_ptr column_path_; std::string key_; }; - + ColumnDecryptionProperties() = default; ColumnDecryptionProperties(const ColumnDecryptionProperties& other) = default; ColumnDecryptionProperties(ColumnDecryptionProperties&& other) = default; - + const std::shared_ptr& getPath() { return column_path_; } const std::string& getKey() const { return key_; } - + private: const std::shared_ptr column_path_; std::string key_; - + // This class is only required for setting explicit column decryption keys - // to override key retriever (or to provide keys when key metadata and/or // key retriever are not available) explicit ColumnDecryptionProperties(const std::shared_ptr& column_path, const std::string& key):column_path_(column_path){ DCHECK(column_path != nullptr); - + if (!key.empty()) DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); - + key_ = key; } }; - + class PARQUET_EXPORT AADPrefixVerifier { public: // Verifies identity (AAD Prefix) of individual file, or of file collection in a data set. @@ -221,7 +217,7 @@ class PARQUET_EXPORT AADPrefixVerifier { // In a data set, AAD Prefixes should be collected, and then checked for missing files. virtual void check(std::string aad_prefix) = 0; }; - + class PARQUET_EXPORT FileDecryptionProperties { public: class Builder { @@ -229,7 +225,7 @@ class PARQUET_EXPORT FileDecryptionProperties { Builder(){ check_plaintext_footer_integrity_ = DEFAULT_CHECK_SIGNATURE; } - + // Set an explicit footer key. If applied on a file that contains footer key metadata - // the metadata will be ignored, the footer will be decrypted/verified with this key. 
// If explicit key is not set, footer key will be fetched from key retriever. @@ -242,7 +238,7 @@ class PARQUET_EXPORT FileDecryptionProperties { footer_key_ = footer_key; return this; } - + // Set explicit column keys (decryption properties). // Its also possible to set a key retriever on this property object. Upon file decryption, // availability of explicit keys is checked before invocation of the retriever callback. @@ -254,29 +250,29 @@ class PARQUET_EXPORT FileDecryptionProperties { column_properties) { if (column_properties.size () == 0) return this; - + if (column_property_map_.size () != 0) throw ParquetException("Column properties already set"); - + column_property_map_ = column_properties; return this; } - + // Set a key retriever callback. Its also possible to // set explicit footer or column keys on this file property object. Upon file decryption, // availability of explicit keys is checked before invocation of the retriever callback. // If an explicit key is available for a footer or a column, its key metadata will // be ignored. Builder* withKeyRetriever(const std::shared_ptr& - key_retriever) { + key_retriever) { if (key_retriever == NULLPTR) - return this; - + return this; + DCHECK(key_retriever_ == NULLPTR); key_retriever_ = key_retriever; - return this; + return this; } - + // Skip integrity verification of plaintext footers. // If not called, integrity of plaintext footers will be checked in runtime, // and an exception will be thrown in the following situations: @@ -286,107 +282,105 @@ class PARQUET_EXPORT FileDecryptionProperties { check_plaintext_footer_integrity_ = false; return this; } - + // Explicitly supply the file AAD prefix. // A must when a prefix is used for file encryption, but not stored in file. // If AAD prefix is stored in file, it will be compared to the explicitly // supplied value and an exception will be thrown if they differ. 
Builder* withAADPrefix(std::string aad_prefix) { if (aad_prefix.empty()) { - return this; + return this; } DCHECK(aad_prefix_.empty()); - aad_prefix_ = aad_prefix; return this; } - + // Set callback for verification of AAD Prefixes stored in file. Builder* withAADPrefixVerifier(std::shared_ptr aad_prefix_verifier) { if (aad_prefix_verifier == NULLPTR) return this; - + DCHECK(aad_prefix_verifier_ == NULLPTR); aad_prefix_verifier_ = aad_prefix_verifier; return this; } - + std::shared_ptr build() { - return - std::shared_ptr(new FileDecryptionProperties(footer_key_, - key_retriever_, - check_plaintext_footer_integrity_, - aad_prefix_, - aad_prefix_verifier_, - column_property_map_)); - - } - + return std::shared_ptr( + new FileDecryptionProperties(footer_key_, + key_retriever_, + check_plaintext_footer_integrity_, + aad_prefix_, + aad_prefix_verifier_, + column_property_map_)); + } + private: std::string footer_key_; std::string aad_prefix_; std::shared_ptr aad_prefix_verifier_; - + std::map, std::shared_ptr, schema::ColumnPath::CmpColumnPath> column_property_map_; - + std::shared_ptr key_retriever_; - bool check_plaintext_footer_integrity_; - }; - + bool check_plaintext_footer_integrity_; + }; + const std::string& getColumnKey(const std::shared_ptr& column_path) { if (column_property_map_.find(column_path) != column_property_map_.end()) { auto column_prop = column_property_map_[column_path]; - if (column_prop != nullptr) - return column_prop->getKey(); - } + if (column_prop != nullptr) + return column_prop->getKey(); + } return NULL_STRING; } - + const std::string& getFooterKey() { return footer_key_; } - + const std::string& getAADPrefix() { return aad_prefix_; } - + std::shared_ptr getKeyRetriever() { return key_retriever_; } - + bool checkFooterIntegrity() { return check_plaintext_footer_integrity_; } - + const std::shared_ptr &getAADPrefixVerifier() { return aad_prefix_verifier_; } - + private: std::string footer_key_; std::string aad_prefix_; std::shared_ptr 
aad_prefix_verifier_; - + std::map, std::shared_ptr, schema::ColumnPath::CmpColumnPath> column_property_map_; - + std::shared_ptr key_retriever_; bool check_plaintext_footer_integrity_; - - + + FileDecryptionProperties(const std::string& footer_key, - const std::shared_ptr& key_retriever, - bool check_plaintext_footer_integrity, - std::string aad_prefix, - std::shared_ptr aad_prefix_verifier, - std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_property_map) { + const std::shared_ptr& key_retriever, + bool check_plaintext_footer_integrity, + std::string aad_prefix, + std::shared_ptr aad_prefix_verifier, + std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath> column_property_map) { DCHECK(!footer_key.empty() || NULLPTR != key_retriever || 0 != column_property_map.size()); if (!footer_key.empty()) DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || - footer_key.length() == 32); + footer_key.length() == 32); if (footer_key.empty() && check_plaintext_footer_integrity) DCHECK(NULLPTR != key_retriever); aad_prefix_verifier_ = aad_prefix_verifier; @@ -396,9 +390,9 @@ class PARQUET_EXPORT FileDecryptionProperties { aad_prefix_ = aad_prefix; column_property_map_ = column_property_map; } - + }; - + class PARQUET_EXPORT ReaderProperties { public: explicit ReaderProperties(MemoryPool* pool = ::arrow::default_memory_pool()) @@ -430,11 +424,11 @@ class PARQUET_EXPORT ReaderProperties { std::string, parquet::schema::ColumnPath::CmpColumnPath>> column_map () { return column_map_; } - + const std::string& fileAAD() { return fileAAD_; } void set_fileAAD (std::string fileAAD) { fileAAD_ = fileAAD; } - + void file_decryption(const std::shared_ptr& decryption) { file_decryption_ = decryption; } @@ -448,7 +442,7 @@ class PARQUET_EXPORT ReaderProperties { std::shared_ptr file_decryption_; std::shared_ptr, std::string, parquet::schema::ColumnPath::CmpColumnPath>> column_map_; // a map between - //ColumnPath and their encryption keys + 
//ColumnPath and their encryption keys std::string fileAAD_; }; @@ -527,65 +521,65 @@ class PARQUET_EXPORT FileEncryptionProperties { footer_key_ = footer_key; store_aad_prefix_in_file_ = false; } - + // Create files with plaintext footer. // If not called, the files will be created with encrypted footer (default). Builder* withPlaintextFooter() { encrypted_footer_ = false; return this; } - + // Set encryption algorithm. // If not called, files will be encrypted with AES_GCM_V1 (default). Builder* withAlgorithm(ParquetCipher::type parquet_cipher) { parquet_cipher_ = parquet_cipher; return this; } - + // Set a key retrieval metadata (converted from String). // use either withFooterKeyMetaData or withFooterKeyID, not both. Builder* withFooterKeyID(std::string key_id) { //key_id is expected to be in UTF8 encoding ::arrow::util::InitializeUTF8(); if (!::arrow::util::ValidateUTF8(reinterpret_cast(key_id.c_str()), - key_id.size())) + key_id.size())) throw ParquetException("footer key id should be in UTF8 encoding"); - - if (key_id.empty()) + + if (key_id.empty()) return this; - + return withFooterKeyMetadata(key_id); } - + // Set a key retrieval metadata. // use either withFooterKeyMetaData or withFooterKeyID, not both. Builder* withFooterKeyMetadata(const std::string& footer_key_metadata) { if (footer_key_metadata.empty()) return this; - + DCHECK(footer_key_metadata_.empty()); footer_key_metadata_ = footer_key_metadata; return this; } - + // Set the file AAD Prefix. Builder* withAADPrefix(const std::string& aad_prefix) { if (aad_prefix.empty()) return this; - + DCHECK(aad_prefix_.empty()); aad_prefix_ = aad_prefix; store_aad_prefix_in_file_ = true; return this; } - + // Skip storing AAD Prefix in file. // If not called, and if AAD Prefix is set, it will be stored. Builder* withoutAADPrefixStorage() { store_aad_prefix_in_file_ = false; return this; } - + // Set the list of encrypted columns and their properties (keys etc). 
// If not called, all columns will be encrypted with the footer key. // If called, the file columns not in the list will be left unencrypted. @@ -595,63 +589,62 @@ class PARQUET_EXPORT FileEncryptionProperties { encryptedColumns){ if (encryptedColumns.size () == 0) return this; - + if (column_property_map_.size () != 0) throw ParquetException("Column properties already set"); - + column_property_map_ = encryptedColumns; return this; } - + std::shared_ptr build() { - return - std::shared_ptr(new FileEncryptionProperties( - parquet_cipher_, - footer_key_, - footer_key_metadata_, - encrypted_footer_, - aad_prefix_, - store_aad_prefix_in_file_, - column_property_map_)); - } - + return std::shared_ptr( + new FileEncryptionProperties(parquet_cipher_, + footer_key_, + footer_key_metadata_, + encrypted_footer_, + aad_prefix_, + store_aad_prefix_in_file_, + column_property_map_)); + } + private: ParquetCipher::type parquet_cipher_; bool encrypted_footer_; std::string footer_key_; std::string footer_key_metadata_; - + std::string aad_prefix_; bool store_aad_prefix_in_file_; std::map, std::shared_ptr, schema::ColumnPath::CmpColumnPath> column_property_map_; }; - + bool encryptedFooter() const { return encrypted_footer_; } - + const EncryptionAlgorithm getAlgorithm() { return algorithm_; } - + const std::string& getFooterEncryptionKey() { return (encrypted_footer_? footer_key_ : NULL_STRING); } - + const std::string& getFooterEncryptionKeyMetadata() { return (encrypted_footer_? footer_key_metadata_ : NULL_STRING); } - + const std::string& getFooterSigningKey() { return (encrypted_footer_? NULL_STRING : footer_key_); } - + const std::string& getFooterSigningKeyMetadata() { return (encrypted_footer_? 
NULL_STRING : footer_key_metadata_); } - + const std::string& getFileAAD() const { return file_AAD_; } - + std::shared_ptr getColumnProperties(const std::shared_ptr& column_path) { if (column_property_map_.size () == 0){ @@ -662,34 +655,34 @@ class PARQUET_EXPORT FileEncryptionProperties { } if (column_property_map_.find(column_path) != column_property_map_.end()) return column_property_map_[column_path]; - + return NULLPTR; } - + private: EncryptionAlgorithm algorithm_; // encryption algorithm std::string footer_key_; // encryption key, should have 16, 24, 32-byte length std::string footer_key_metadata_; bool encrypted_footer_; std::string file_AAD_; - + std::map, std::shared_ptr, schema::ColumnPath::CmpColumnPath> column_property_map_; - + FileEncryptionProperties(ParquetCipher::type cipher, - std::string footer_key, - std::string footer_key_metadata, - bool encrypted_footer, - const std::string& aad_prefix, bool store_aad_prefix_in_file, - const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& - column_property_map) + std::string footer_key, + std::string footer_key_metadata, + bool encrypted_footer, + const std::string& aad_prefix, bool store_aad_prefix_in_file, + const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& + column_property_map) : footer_key_(footer_key), footer_key_metadata_(footer_key_metadata), encrypted_footer_(encrypted_footer), - column_property_map_(column_property_map){ + column_property_map_(column_property_map){ DCHECK(!footer_key.empty()); // footer_key must be either 16, 24 or 32 bytes. 
DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || footer_key.length() == 32); @@ -699,7 +692,7 @@ class PARQUET_EXPORT FileEncryptionProperties { RAND_bytes(aad_file_unique, sizeof(AAD_FILE_UNIQUE_LENGTH)); std::string aad_file_unique_str(reinterpret_cast(aad_file_unique), AAD_FILE_UNIQUE_LENGTH) ; - + bool supply_aad_prefix = false; if (aad_prefix.empty()) file_AAD_ = aad_file_unique_str; @@ -713,9 +706,9 @@ class PARQUET_EXPORT FileEncryptionProperties { if (!aad_prefix.empty() && store_aad_prefix_in_file) { algorithm_.aad.aad_prefix = aad_prefix; } - } + } }; - + class PARQUET_EXPORT WriterProperties { public: class Builder { @@ -955,13 +948,13 @@ class PARQUET_EXPORT WriterProperties { } else { std::string footer_key = parquet_file_encryption_->getFooterEncryptionKey (); if (footer_key.empty()) - footer_key = parquet_file_encryption_->getFooterSigningKey (); + footer_key = parquet_file_encryption_->getFooterSigningKey (); return std::make_shared(parquet_file_encryption_->getAlgorithm().algorithm, - footer_key, parquet_file_encryption_->getFileAAD()); - + footer_key, parquet_file_encryption_->getFileAAD()); + } } - + inline Encoding::type dictionary_index_encoding() const { if (parquet_version_ == ParquetVersion::PARQUET_1_0) { return Encoding::PLAIN_DICTIONARY; @@ -1006,7 +999,7 @@ class PARQUET_EXPORT WriterProperties { } std::shared_ptr column_encryption_props(const - std::shared_ptr& path) const { + std::shared_ptr& path) const { if (parquet_file_encryption_) { return parquet_file_encryption_->getColumnProperties(path); } else { @@ -1015,32 +1008,31 @@ class PARQUET_EXPORT WriterProperties { } std::shared_ptr encryption( - const std::shared_ptr& path) const { + const std::shared_ptr& path) const { if (parquet_file_encryption_) { auto column_prop = parquet_file_encryption_->getColumnProperties(path); if (column_prop == NULLPTR) - return NULLPTR; + return NULLPTR; if (column_prop->isEncryptedWithFooterKey()) { - if 
(parquet_file_encryption_->encryptedFooter ()) { - return std::make_shared(parquet_file_encryption_->getAlgorithm().algorithm, - parquet_file_encryption_->getFooterEncryptionKey(), - parquet_file_encryption_->getFileAAD()); - } else { - return std::make_shared(parquet_file_encryption_->getAlgorithm().algorithm, - parquet_file_encryption_->getFooterSigningKey(), - parquet_file_encryption_->getFileAAD()); - } + if (parquet_file_encryption_->encryptedFooter ()) { + return std::make_shared(parquet_file_encryption_->getAlgorithm().algorithm, + parquet_file_encryption_->getFooterEncryptionKey(), + parquet_file_encryption_->getFileAAD()); + } else { + return std::make_shared(parquet_file_encryption_->getAlgorithm().algorithm, + parquet_file_encryption_->getFooterSigningKey(), + parquet_file_encryption_->getFileAAD()); + } } - + return std::make_shared(parquet_file_encryption_->getAlgorithm().algorithm, - column_prop->getKey(), - parquet_file_encryption_->getFileAAD()); - + column_prop->getKey(), + parquet_file_encryption_->getFileAAD()); } else { return NULLPTR; } } - + private: explicit WriterProperties( MemoryPool* pool, int64_t dictionary_pagesize_limit, int64_t write_batch_size, diff --git a/cpp/src/parquet/schema.h b/cpp/src/parquet/schema.h index 5647e2a9d864..a4882fdc2f43 100644 --- a/cpp/src/parquet/schema.h +++ b/cpp/src/parquet/schema.h @@ -90,7 +90,7 @@ class PARQUET_EXPORT ColumnPath { struct CmpColumnPath { bool operator()(const std::shared_ptr& a, - const std::shared_ptr& b) const { + const std::shared_ptr& b) const { return a->ToDotString() < b->ToDotString(); } }; diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 81744e46d39a..2cd41833b4b0 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -480,14 +480,6 @@ struct EncryptionAlgorithm { }; class PARQUET_EXPORT EncryptionProperties { - private: - static inline uint8_t* str2bytes(const std::string& str) { - if (str.empty()) return NULLPTR; - - char* cbytes = 
const_cast(str.c_str()); - return reinterpret_cast(cbytes); - } - public: EncryptionProperties() = default; EncryptionProperties(ParquetCipher::type algorithm, const std::string& key, @@ -532,6 +524,13 @@ class PARQUET_EXPORT EncryptionProperties { std::string key_; // encryption key, should have 16, 24, 32-byte length std::string file_aad_; std::string aad_; // encryption additional authenticated data + static inline uint8_t* str2bytes(const std::string& str) { + if (str.empty()) return NULLPTR; + + char* cbytes = const_cast(str.c_str()); + return reinterpret_cast(cbytes); + } + }; // parquet::PageType From b176ba5aaa04a3a778357e6ce396752567c9fa47 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Mon, 22 Apr 2019 22:55:19 +0300 Subject: [PATCH 028/201] Code style fixes in properties.h --- cpp/src/parquet/properties.h | 142 ++++++++++++++++++++--------------- 1 file changed, 81 insertions(+), 61 deletions(-) diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index cf7542d49479..c93a435e8233 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -57,7 +57,8 @@ class PARQUET_EXPORT ColumnEncryptionProperties { } // Convenience builder for encrypted columns. - Builder(const std::shared_ptr& path) : Builder(path, true) {} + Builder(const std::shared_ptr& path) + : Builder(path, true) {} // Set a column-specific key. 
// If key is not set on an encrypted column, the column will @@ -87,8 +88,8 @@ class PARQUET_EXPORT ColumnEncryptionProperties { Builder* withKeyID(std::string key_id) { //key_id is expected to be in UTF8 encoding ::arrow::util::InitializeUTF8(); - if (!::arrow::util::ValidateUTF8(reinterpret_cast(key_id.c_str()), - key_id.size())) + const uint8_t *data = reinterpret_cast(key_id.c_str()); + if (!::arrow::util::ValidateUTF8(data, key_id.size())) throw ParquetException("key id should be in UTF8 encoding"); DCHECK(!key_id.empty()); @@ -97,12 +98,11 @@ class PARQUET_EXPORT ColumnEncryptionProperties { } std::shared_ptr build() { - return - std::shared_ptr(new ColumnEncryptionProperties( - encrypted_, - column_path_, - key_, - key_metadata_)); + return std::shared_ptr( + new ColumnEncryptionProperties(encrypted_, + column_path_, + key_, + key_metadata_)); } private: @@ -130,10 +130,11 @@ class PARQUET_EXPORT ColumnEncryptionProperties { bool encrypted_with_footer_key_; std::string key_; std::string key_metadata_; - explicit ColumnEncryptionProperties(bool encrypted, - const std::shared_ptr& column_path, - const std::string& key, - const std::string& key_metadata):column_path_(column_path){ + explicit ColumnEncryptionProperties( + bool encrypted, + const std::shared_ptr& column_path, + const std::string& key, + const std::string& key_metadata):column_path_(column_path){ DCHECK(column_path != nullptr); if (!encrypted) DCHECK(key.empty() && key_metadata.empty()); @@ -163,8 +164,9 @@ class PARQUET_EXPORT ColumnDecryptionProperties { Builder(const std::shared_ptr& path) : column_path_(path) {} - // Set an explicit column key. If applied on a file that contains key metadata for this column - - // the metadata will be ignored, the column will be decrypted with this key. + // Set an explicit column key. If applied on a file that contains + // key metadata for this column the metadata will be ignored, + // the column will be decrypted with this key. 
// key length must be either 16, 24 or 32 bytes. Builder* withKey(const std::string& key) { if (key.empty ()) @@ -199,8 +201,9 @@ class PARQUET_EXPORT ColumnDecryptionProperties { // This class is only required for setting explicit column decryption keys - // to override key retriever (or to provide keys when key metadata and/or // key retriever are not available) - explicit ColumnDecryptionProperties(const std::shared_ptr& column_path, - const std::string& key):column_path_(column_path){ + explicit ColumnDecryptionProperties( + const std::shared_ptr& column_path, + const std::string& key):column_path_(column_path){ DCHECK(column_path != nullptr); if (!key.empty()) @@ -212,9 +215,11 @@ class PARQUET_EXPORT ColumnDecryptionProperties { class PARQUET_EXPORT AADPrefixVerifier { public: - // Verifies identity (AAD Prefix) of individual file, or of file collection in a data set. + // Verifies identity (AAD Prefix) of individual file, + // or of file collection in a data set. // Throws exception if an AAD prefix is wrong. - // In a data set, AAD Prefixes should be collected, and then checked for missing files. + // In a data set, AAD Prefixes should be collected, + // and then checked for missing files. virtual void check(std::string aad_prefix) = 0; }; @@ -226,10 +231,12 @@ class PARQUET_EXPORT FileDecryptionProperties { check_plaintext_footer_integrity_ = DEFAULT_CHECK_SIGNATURE; } - // Set an explicit footer key. If applied on a file that contains footer key metadata - - // the metadata will be ignored, the footer will be decrypted/verified with this key. - // If explicit key is not set, footer key will be fetched from key retriever. - //param footerKey Key length must be either 16, 24 or 32 bytes. + // Set an explicit footer key. If applied on a file that contains + // footer key metadata the metadata will be ignored, the footer + // will be decrypted/verified with this key. + // If explicit key is not set, footer key will be fetched from + // key retriever. 
+ // param footerKey Key length must be either 16, 24 or 32 bytes. Builder* withFooterKey(std::string footer_key) { if (footer_key.empty ()) { return this; @@ -240,10 +247,11 @@ class PARQUET_EXPORT FileDecryptionProperties { } // Set explicit column keys (decryption properties). - // Its also possible to set a key retriever on this property object. Upon file decryption, - // availability of explicit keys is checked before invocation of the retriever callback. - // If an explicit key is available for a footer or a column, its key metadata will - // be ignored. + // Its also possible to set a key retriever on this property object. + // Upon file decryption, availability of explicit keys is checked before + // invocation of the retriever callback. + // If an explicit key is available for a footer or a column, + // its key metadata will be ignored. Builder* withColumnKeys(const std::map, std::shared_ptr, schema::ColumnPath::CmpColumnPath>& @@ -259,10 +267,11 @@ class PARQUET_EXPORT FileDecryptionProperties { } // Set a key retriever callback. Its also possible to - // set explicit footer or column keys on this file property object. Upon file decryption, - // availability of explicit keys is checked before invocation of the retriever callback. - // If an explicit key is available for a footer or a column, its key metadata will - // be ignored. + // set explicit footer or column keys on this file property object. + // Upon file decryption, availability of explicit keys is checked before + // invocation of the retriever callback. + // If an explicit key is available for a footer or a column, + // its key metadata will be ignored. Builder* withKeyRetriever(const std::shared_ptr& key_retriever) { if (key_retriever == NULLPTR) @@ -276,7 +285,8 @@ class PARQUET_EXPORT FileDecryptionProperties { // Skip integrity verification of plaintext footers. 
// If not called, integrity of plaintext footers will be checked in runtime, // and an exception will be thrown in the following situations: - // - footer signing key is not available (not passed, or not found by key retriever) + // - footer signing key is not available + // (not passed, or not found by key retriever) // - footer content and signature don't match Builder* withoutFooterSignatureVerification() { check_plaintext_footer_integrity_ = false; @@ -297,7 +307,8 @@ class PARQUET_EXPORT FileDecryptionProperties { } // Set callback for verification of AAD Prefixes stored in file. - Builder* withAADPrefixVerifier(std::shared_ptr aad_prefix_verifier) { + Builder* withAADPrefixVerifier( + std::shared_ptr aad_prefix_verifier) { if (aad_prefix_verifier == NULLPTR) return this; @@ -329,7 +340,8 @@ class PARQUET_EXPORT FileDecryptionProperties { bool check_plaintext_footer_integrity_; }; - const std::string& getColumnKey(const std::shared_ptr& column_path) { + const std::string& getColumnKey( + const std::shared_ptr& column_path) { if (column_property_map_.find(column_path) != column_property_map_.end()) { auto column_prop = column_property_map_[column_path]; if (column_prop != nullptr) @@ -369,15 +381,18 @@ class PARQUET_EXPORT FileDecryptionProperties { bool check_plaintext_footer_integrity_; - FileDecryptionProperties(const std::string& footer_key, - const std::shared_ptr& key_retriever, - bool check_plaintext_footer_integrity, - std::string aad_prefix, - std::shared_ptr aad_prefix_verifier, - std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_property_map) { - DCHECK(!footer_key.empty() || NULLPTR != key_retriever || 0 != column_property_map.size()); + FileDecryptionProperties( + const std::string& footer_key, + const std::shared_ptr& key_retriever, + bool check_plaintext_footer_integrity, + std::string aad_prefix, + std::shared_ptr aad_prefix_verifier, + std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath> column_property_map) 
{ + DCHECK(!footer_key.empty() || + NULLPTR != key_retriever || + 0 != column_property_map.size()); if (!footer_key.empty()) DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || footer_key.length() == 32); @@ -402,7 +417,7 @@ class PARQUET_EXPORT ReaderProperties { column_map_ = std::shared_ptr, std::string, parquet::schema::ColumnPath::CmpColumnPath>>(new std::map, std::string, - parquet::schema::ColumnPath::CmpColumnPath>()); + schema::ColumnPath::CmpColumnPath>()); } MemoryPool* memory_pool() const { return pool_; } @@ -517,7 +532,8 @@ class PARQUET_EXPORT FileEncryptionProperties { class Builder { public: Builder(const std::string& footer_key) - : parquet_cipher_(DEFAULT_ENCRYPTION_ALGORITHM), encrypted_footer_(DEFAULT_ENCRYPTED_FOOTER) { + : parquet_cipher_(DEFAULT_ENCRYPTION_ALGORITHM), + encrypted_footer_(DEFAULT_ENCRYPTED_FOOTER) { footer_key_ = footer_key; store_aad_prefix_in_file_ = false; } @@ -541,8 +557,8 @@ class PARQUET_EXPORT FileEncryptionProperties { Builder* withFooterKeyID(std::string key_id) { //key_id is expected to be in UTF8 encoding ::arrow::util::InitializeUTF8(); - if (!::arrow::util::ValidateUTF8(reinterpret_cast(key_id.c_str()), - key_id.size())) + const uint8_t* data = reinterpret_cast(key_id.c_str()); + if (!::arrow::util::ValidateUTF8(data, key_id.size())) throw ParquetException("footer key id should be in UTF8 encoding"); if (key_id.empty()) @@ -583,10 +599,11 @@ class PARQUET_EXPORT FileEncryptionProperties { // Set the list of encrypted columns and their properties (keys etc). // If not called, all columns will be encrypted with the footer key. // If called, the file columns not in the list will be left unencrypted. 
- Builder* withEncryptedColumns(const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& - encryptedColumns){ + Builder* withEncryptedColumns( + const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& + encryptedColumns){ if (encryptedColumns.size () == 0) return this; @@ -645,12 +662,11 @@ class PARQUET_EXPORT FileEncryptionProperties { const std::string& getFileAAD() const { return file_AAD_; } - std::shared_ptr - getColumnProperties(const std::shared_ptr& column_path) { + std::shared_ptr getColumnProperties( + const std::shared_ptr& column_path) { if (column_property_map_.size () == 0){ - auto builder = - std::shared_ptr( - new ColumnEncryptionProperties::Builder (column_path)); + auto builder = std::shared_ptr( + new ColumnEncryptionProperties::Builder (column_path)); return builder->build(); } if (column_property_map_.find(column_path) != column_property_map_.end()) @@ -661,7 +677,7 @@ class PARQUET_EXPORT FileEncryptionProperties { private: EncryptionAlgorithm algorithm_; // encryption algorithm - std::string footer_key_; // encryption key, should have 16, 24, 32-byte length + std::string footer_key_; std::string footer_key_metadata_; bool encrypted_footer_; std::string file_AAD_; @@ -674,7 +690,8 @@ class PARQUET_EXPORT FileEncryptionProperties { std::string footer_key, std::string footer_key_metadata, bool encrypted_footer, - const std::string& aad_prefix, bool store_aad_prefix_in_file, + const std::string& aad_prefix, + bool store_aad_prefix_in_file, const std::map, std::shared_ptr, schema::ColumnPath::CmpColumnPath>& @@ -685,13 +702,16 @@ class PARQUET_EXPORT FileEncryptionProperties { column_property_map_(column_property_map){ DCHECK(!footer_key.empty()); // footer_key must be either 16, 24 or 32 bytes. 
- DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || footer_key.length() == 32); + DCHECK(footer_key.length() == 16 + || footer_key.length() == 24 + || footer_key.length() == 32); uint8_t aad_file_unique[AAD_FILE_UNIQUE_LENGTH]; memset(aad_file_unique, 0, AAD_FILE_UNIQUE_LENGTH); RAND_bytes(aad_file_unique, sizeof(AAD_FILE_UNIQUE_LENGTH)); - std::string aad_file_unique_str(reinterpret_cast(aad_file_unique), - AAD_FILE_UNIQUE_LENGTH) ; + std::string aad_file_unique_str( + reinterpret_cast(aad_file_unique), + AAD_FILE_UNIQUE_LENGTH) ; bool supply_aad_prefix = false; if (aad_prefix.empty()) From ca1bd2b031272e0c1f490667da9630b5611cc3fc Mon Sep 17 00:00:00 2001 From: Ha Thi Tham Date: Wed, 24 Apr 2019 17:28:51 +0700 Subject: [PATCH 029/201] revert change in parquet.thrift --- cpp/src/parquet/parquet.thrift | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/parquet/parquet.thrift b/cpp/src/parquet/parquet.thrift index 288c72a60a64..000b74dde1c8 100644 --- a/cpp/src/parquet/parquet.thrift +++ b/cpp/src/parquet/parquet.thrift @@ -570,7 +570,7 @@ struct PageHeader { /** Uncompressed page size in bytes (not including this header) **/ 2: required i32 uncompressed_page_size - /** Compressed (and potentially encrypted) page size in bytes, not including this header **/ + /** Compressed page size in bytes (not including this header) **/ 3: required i32 compressed_page_size /** 32bit crc for the data below. This allows for disabling checksumming in HDFS @@ -955,7 +955,6 @@ struct FileMetaData { 9: optional binary footer_signing_key_metadata } -/** Crypto metadata for files with encrypted footer **/ struct FileCryptoMetaData { /** * Encryption algorithm. 
This field is only used for files From a5eee07fa53dbdba7a15fd665909912db768191a Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Mon, 29 Apr 2019 08:40:28 +0300 Subject: [PATCH 030/201] Move all encrypted related classes from properties.h to new files: encryption_properties.h and encryption_properties.cc --- cpp/src/parquet/CMakeLists.txt | 1 + cpp/src/parquet/encryption_properties.cc | 176 +++++++ cpp/src/parquet/encryption_properties.h | 498 ++++++++++++++++++++ cpp/src/parquet/properties.h | 575 +---------------------- 4 files changed, 677 insertions(+), 573 deletions(-) create mode 100644 cpp/src/parquet/encryption_properties.cc create mode 100644 cpp/src/parquet/encryption_properties.h diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index e5ecee4a8d27..86fe61d6af00 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -168,6 +168,7 @@ set(PARQUET_SRCS deprecated_io.cc encoding.cc encryption.cc + encryption_properties.cc file_reader.cc file_writer.cc metadata.cc diff --git a/cpp/src/parquet/encryption_properties.cc b/cpp/src/parquet/encryption_properties.cc new file mode 100644 index 000000000000..1eb3847ec718 --- /dev/null +++ b/cpp/src/parquet/encryption_properties.cc @@ -0,0 +1,176 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "parquet/encryption_properties.h" + +#include + +#include "arrow/util/utf8.h" + +namespace parquet { + +ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::withKeyID( + std::string key_id) { + //key_id is expected to be in UTF8 encoding + ::arrow::util::InitializeUTF8(); + const uint8_t *data = reinterpret_cast(key_id.c_str()); + if (!::arrow::util::ValidateUTF8(data, key_id.size())) + throw ParquetException("key id should be in UTF8 encoding"); + + DCHECK(!key_id.empty()); + this->withKeyMetaData(key_id); + return this; +} + +ColumnEncryptionProperties::ColumnEncryptionProperties( + bool encrypted, + const std::shared_ptr& column_path, + const std::string& key, + const std::string& key_metadata):column_path_(column_path){ + DCHECK(column_path != nullptr); + if (!encrypted) + DCHECK(key.empty() && key_metadata.empty()); + + if (!key.empty()) + DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); + + encrypted_with_footer_key_ = (encrypted && key.empty()); + if (encrypted_with_footer_key_) + DCHECK(key_metadata.empty()); + + encrypted_ = encrypted; + key_metadata_ = key_metadata; + key_ = key; +} + +ColumnDecryptionProperties::ColumnDecryptionProperties( + const std::shared_ptr& column_path, + const std::string& key):column_path_(column_path){ + DCHECK(column_path != nullptr); + + if (!key.empty()) + DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); + + key_ = key; +} + +const std::string& FileDecryptionProperties::getColumnKey( + const std::shared_ptr& column_path) { + if (column_property_map_.find(column_path) != column_property_map_.end()) { + auto column_prop = column_property_map_[column_path]; + if (column_prop != nullptr) + return column_prop->getKey(); + } + return NULL_STRING; +} + +FileDecryptionProperties::FileDecryptionProperties( + const std::string& footer_key, + const 
std::shared_ptr& key_retriever, + bool check_plaintext_footer_integrity, + std::string aad_prefix, + std::shared_ptr aad_prefix_verifier, + std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath> column_property_map) { + DCHECK(!footer_key.empty() || + NULLPTR != key_retriever || + 0 != column_property_map.size()); + + if (!footer_key.empty()) + DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || + footer_key.length() == 32); + if (footer_key.empty() && check_plaintext_footer_integrity) + DCHECK(NULLPTR != key_retriever); + aad_prefix_verifier_ = aad_prefix_verifier; + footer_key_ = footer_key; + check_plaintext_footer_integrity_ = check_plaintext_footer_integrity; + key_retriever_ = key_retriever; + aad_prefix_ = aad_prefix; + column_property_map_ = column_property_map; +} + +FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::withFooterKeyID( + std::string key_id) { + //key_id is expected to be in UTF8 encoding + ::arrow::util::InitializeUTF8(); + const uint8_t* data = reinterpret_cast(key_id.c_str()); + if (!::arrow::util::ValidateUTF8(data, key_id.size())) + throw ParquetException("footer key id should be in UTF8 encoding"); + + if (key_id.empty()) + return this; + + return withFooterKeyMetadata(key_id); +} + +std::shared_ptr FileEncryptionProperties::getColumnProperties( + const std::shared_ptr& column_path) { + if (column_property_map_.size () == 0){ + auto builder = std::shared_ptr( + new ColumnEncryptionProperties::Builder (column_path)); + return builder->build(); + } + if (column_property_map_.find(column_path) != column_property_map_.end()) + return column_property_map_[column_path]; + + return NULLPTR; +} + +FileEncryptionProperties::FileEncryptionProperties(ParquetCipher::type cipher, + std::string footer_key, + std::string footer_key_metadata, + bool encrypted_footer, + const std::string& aad_prefix, + bool store_aad_prefix_in_file, + const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& + 
column_property_map) +: footer_key_(footer_key), + footer_key_metadata_(footer_key_metadata), + encrypted_footer_(encrypted_footer), + column_property_map_(column_property_map){ + DCHECK(!footer_key.empty()); + // footer_key must be either 16, 24 or 32 bytes. + DCHECK(footer_key.length() == 16 + || footer_key.length() == 24 + || footer_key.length() == 32); + + uint8_t aad_file_unique[AAD_FILE_UNIQUE_LENGTH]; + memset(aad_file_unique, 0, AAD_FILE_UNIQUE_LENGTH); + RAND_bytes(aad_file_unique, sizeof(AAD_FILE_UNIQUE_LENGTH)); + std::string aad_file_unique_str( + reinterpret_cast(aad_file_unique), + AAD_FILE_UNIQUE_LENGTH) ; + + bool supply_aad_prefix = false; + if (aad_prefix.empty()) + file_AAD_ = aad_file_unique_str; + else { + file_AAD_ = aad_prefix + aad_file_unique_str; + if (!store_aad_prefix_in_file) supply_aad_prefix = true; + } + algorithm_.algorithm = cipher; + algorithm_.aad.aad_file_unique = aad_file_unique_str; + algorithm_.aad.supply_aad_prefix = supply_aad_prefix; + if (!aad_prefix.empty() && store_aad_prefix_in_file) { + algorithm_.aad.aad_prefix = aad_prefix; + } +} + +} // namespace parquet diff --git a/cpp/src/parquet/encryption_properties.h b/cpp/src/parquet/encryption_properties.h new file mode 100644 index 000000000000..f3ed589260f1 --- /dev/null +++ b/cpp/src/parquet/encryption_properties.h @@ -0,0 +1,498 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PARQUET_ENCRYPTION_PROPERTIES_H +#define PARQUET_ENCRYPTION_PROPERTIES_H + +#include +#include +#include + +#include "parquet/encryption.h" +#include "parquet/exception.h" +#include "parquet/schema.h" +#include "parquet/types.h" +#include "arrow/util/logging.h" +#include "parquet/util/visibility.h" + +namespace parquet { + +static const std::string NULL_STRING = ""; +static constexpr ParquetCipher::type DEFAULT_ENCRYPTION_ALGORITHM + = ParquetCipher::AES_GCM_V1; +static constexpr int32_t MAXIMAL_AAD_METADATA_LENGTH = 256; +static constexpr bool DEFAULT_ENCRYPTED_FOOTER = true; +static constexpr bool DEFAULT_CHECK_SIGNATURE = true; +static constexpr int32_t AAD_FILE_UNIQUE_LENGTH = 8; + +class PARQUET_EXPORT ColumnEncryptionProperties { + public: + class Builder { + public: + // Convenience builder for regular (not nested) columns. + Builder(const std::string name) { + Builder(schema::ColumnPath::FromDotString(name), true); + } + + // Convenience builder for encrypted columns. + Builder(const std::shared_ptr& path) + : Builder(path, true) {} + + // Set a column-specific key. + // If key is not set on an encrypted column, the column will + // be encrypted with the footer key. + // keyBytes Key length must be either 16, 24 or 32 bytes. + Builder* withKey(const std::string& key) { + if (key.empty ()) + return this; + + DCHECK(!key.empty()); + key_ = key; + return this; + } + + // Set a key retrieval metadata. 
+ // use either withKeyMetaData or withKeyID, not both + Builder* withKeyMetaData(const std::string& key_metadata) { + DCHECK(!key_metadata.empty()); + DCHECK(key_metadata_.empty()); + key_metadata_ = key_metadata; + return this; + } + + // Set a key retrieval metadata (converted from String). + // use either withKeyMetaData or withKeyID, not both + // key_id will be converted to metadata (UTF-8 array). + Builder* withKeyID(std::string key_id); + + std::shared_ptr build() { + return std::shared_ptr( + new ColumnEncryptionProperties(encrypted_, + column_path_, + key_, + key_metadata_)); + } + + private: + const std::shared_ptr column_path_; + bool encrypted_; + std::string key_; + std::string key_metadata_; + + Builder(const std::shared_ptr& path, bool encrypted) + : column_path_(path), encrypted_(encrypted) {} + }; + + const std::shared_ptr& getPath() { return column_path_; } + bool isEncrypted() const { return encrypted_; } + bool isEncryptedWithFooterKey() const { return encrypted_with_footer_key_; } + const std::string& getKey() const { return key_; } + const std::string& getKeyMetaData() const { return key_metadata_; } + + ColumnEncryptionProperties() = default; + ColumnEncryptionProperties(const ColumnEncryptionProperties& other) = default; + ColumnEncryptionProperties(ColumnEncryptionProperties&& other) = default; + + private: + const std::shared_ptr column_path_; + bool encrypted_; + bool encrypted_with_footer_key_; + std::string key_; + std::string key_metadata_; + explicit ColumnEncryptionProperties( + bool encrypted, + const std::shared_ptr& column_path, + const std::string& key, + const std::string& key_metadata); +}; + +class PARQUET_EXPORT ColumnDecryptionProperties { + public: + class Builder { + public: + // convenience builder for regular (not nested) columns. 
+ Builder(const std::string name) { + Builder(schema::ColumnPath::FromDotString(name)); + } + + Builder(const std::shared_ptr& path) + : column_path_(path) {} + + // Set an explicit column key. If applied on a file that contains + // key metadata for this column the metadata will be ignored, + // the column will be decrypted with this key. + // key length must be either 16, 24 or 32 bytes. + Builder* withKey(const std::string& key) { + if (key.empty ()) + return this; + + DCHECK(!key.empty()); + key_ = key; + return this; + } + + std::shared_ptr build() { + return std::shared_ptr( + new ColumnDecryptionProperties(column_path_, key_)); + } + + private: + const std::shared_ptr column_path_; + std::string key_; + }; + + ColumnDecryptionProperties() = default; + ColumnDecryptionProperties(const ColumnDecryptionProperties& other) = default; + ColumnDecryptionProperties(ColumnDecryptionProperties&& other) = default; + + const std::shared_ptr& getPath() { return column_path_; } + const std::string& getKey() const { return key_; } + + private: + const std::shared_ptr column_path_; + std::string key_; + + // This class is only required for setting explicit column decryption keys - + // to override key retriever (or to provide keys when key metadata and/or + // key retriever are not available) + explicit ColumnDecryptionProperties( + const std::shared_ptr& column_path, + const std::string& key); +}; + +class PARQUET_EXPORT AADPrefixVerifier { + public: + // Verifies identity (AAD Prefix) of individual file, + // or of file collection in a data set. + // Throws exception if an AAD prefix is wrong. + // In a data set, AAD Prefixes should be collected, + // and then checked for missing files. + virtual void check(std::string aad_prefix) = 0; +}; + +class PARQUET_EXPORT FileDecryptionProperties { + public: + class Builder { + public: + Builder(){ + check_plaintext_footer_integrity_ = DEFAULT_CHECK_SIGNATURE; + } + + // Set an explicit footer key. 
If applied on a file that contains + // footer key metadata the metadata will be ignored, the footer + // will be decrypted/verified with this key. + // If explicit key is not set, footer key will be fetched from + // key retriever. + // param footerKey Key length must be either 16, 24 or 32 bytes. + Builder* withFooterKey(std::string footer_key) { + if (footer_key.empty ()) { + return this; + } + DCHECK(!footer_key.empty()); + footer_key_ = footer_key; + return this; + } + + // Set explicit column keys (decryption properties). + // Its also possible to set a key retriever on this property object. + // Upon file decryption, availability of explicit keys is checked before + // invocation of the retriever callback. + // If an explicit key is available for a footer or a column, + // its key metadata will be ignored. + Builder* withColumnKeys(const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& + column_properties) { + if (column_properties.size () == 0) + return this; + + if (column_property_map_.size () != 0) + throw ParquetException("Column properties already set"); + + column_property_map_ = column_properties; + return this; + } + + // Set a key retriever callback. Its also possible to + // set explicit footer or column keys on this file property object. + // Upon file decryption, availability of explicit keys is checked before + // invocation of the retriever callback. + // If an explicit key is available for a footer or a column, + // its key metadata will be ignored. + Builder* withKeyRetriever(const std::shared_ptr& + key_retriever) { + if (key_retriever == NULLPTR) + return this; + + DCHECK(key_retriever_ == NULLPTR); + key_retriever_ = key_retriever; + return this; + } + + // Skip integrity verification of plaintext footers. 
+ // If not called, integrity of plaintext footers will be checked in runtime, + // and an exception will be thrown in the following situations: + // - footer signing key is not available + // (not passed, or not found by key retriever) + // - footer content and signature don't match + Builder* withoutFooterSignatureVerification() { + check_plaintext_footer_integrity_ = false; + return this; + } + + // Explicitly supply the file AAD prefix. + // A must when a prefix is used for file encryption, but not stored in file. + // If AAD prefix is stored in file, it will be compared to the explicitly + // supplied value and an exception will be thrown if they differ. + Builder* withAADPrefix(std::string aad_prefix) { + if (aad_prefix.empty()) { + return this; + } + DCHECK(aad_prefix_.empty()); + aad_prefix_ = aad_prefix; + return this; + } + + // Set callback for verification of AAD Prefixes stored in file. + Builder* withAADPrefixVerifier( + std::shared_ptr aad_prefix_verifier) { + if (aad_prefix_verifier == NULLPTR) + return this; + + DCHECK(aad_prefix_verifier_ == NULLPTR); + aad_prefix_verifier_ = aad_prefix_verifier; + return this; + } + + std::shared_ptr build() { + return std::shared_ptr( + new FileDecryptionProperties(footer_key_, + key_retriever_, + check_plaintext_footer_integrity_, + aad_prefix_, + aad_prefix_verifier_, + column_property_map_)); + } + + private: + std::string footer_key_; + std::string aad_prefix_; + std::shared_ptr aad_prefix_verifier_; + + std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath> column_property_map_; + + std::shared_ptr key_retriever_; + bool check_plaintext_footer_integrity_; + }; + + const std::string& getColumnKey( + const std::shared_ptr& column_path); + + const std::string& getFooterKey() { + return footer_key_; + } + + const std::string& getAADPrefix() { return aad_prefix_; } + std::shared_ptr getKeyRetriever() { + return key_retriever_; + } + + bool checkFooterIntegrity() { + return 
check_plaintext_footer_integrity_; + } + + const std::shared_ptr &getAADPrefixVerifier() { + return aad_prefix_verifier_; + } + + private: + std::string footer_key_; + std::string aad_prefix_; + std::shared_ptr aad_prefix_verifier_; + + std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath> column_property_map_; + + std::shared_ptr key_retriever_; + bool check_plaintext_footer_integrity_; + + FileDecryptionProperties( + const std::string& footer_key, + const std::shared_ptr& key_retriever, + bool check_plaintext_footer_integrity, + std::string aad_prefix, + std::shared_ptr aad_prefix_verifier, + std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath> column_property_map); +}; + +class PARQUET_EXPORT FileEncryptionProperties { + public: + class Builder { + public: + Builder(const std::string& footer_key) + : parquet_cipher_(DEFAULT_ENCRYPTION_ALGORITHM), + encrypted_footer_(DEFAULT_ENCRYPTED_FOOTER) { + footer_key_ = footer_key; + store_aad_prefix_in_file_ = false; + } + + // Create files with plaintext footer. + // If not called, the files will be created with encrypted footer (default). + Builder* withPlaintextFooter() { + encrypted_footer_ = false; + return this; + } + + // Set encryption algorithm. + // If not called, files will be encrypted with AES_GCM_V1 (default). + Builder* withAlgorithm(ParquetCipher::type parquet_cipher) { + parquet_cipher_ = parquet_cipher; + return this; + } + + // Set a key retrieval metadata (converted from String). + // use either withFooterKeyMetaData or withFooterKeyID, not both. + Builder* withFooterKeyID(std::string key_id); + + // Set a key retrieval metadata. + // use either withFooterKeyMetaData or withFooterKeyID, not both. + Builder* withFooterKeyMetadata(const std::string& footer_key_metadata) { + if (footer_key_metadata.empty()) + return this; + + DCHECK(footer_key_metadata_.empty()); + footer_key_metadata_ = footer_key_metadata; + return this; + } + + // Set the file AAD Prefix. 
+ Builder* withAADPrefix(const std::string& aad_prefix) { + if (aad_prefix.empty()) + return this; + + DCHECK(aad_prefix_.empty()); + aad_prefix_ = aad_prefix; + store_aad_prefix_in_file_ = true; + return this; + } + + // Skip storing AAD Prefix in file. + // If not called, and if AAD Prefix is set, it will be stored. + Builder* withoutAADPrefixStorage() { + store_aad_prefix_in_file_ = false; + return this; + } + + // Set the list of encrypted columns and their properties (keys etc). + // If not called, all columns will be encrypted with the footer key. + // If called, the file columns not in the list will be left unencrypted. + Builder* withEncryptedColumns( + const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& + encryptedColumns){ + if (encryptedColumns.size () == 0) + return this; + + if (column_property_map_.size () != 0) + throw ParquetException("Column properties already set"); + + column_property_map_ = encryptedColumns; + return this; + } + + std::shared_ptr build() { + return std::shared_ptr( + new FileEncryptionProperties(parquet_cipher_, + footer_key_, + footer_key_metadata_, + encrypted_footer_, + aad_prefix_, + store_aad_prefix_in_file_, + column_property_map_)); + } + + private: + ParquetCipher::type parquet_cipher_; + bool encrypted_footer_; + std::string footer_key_; + std::string footer_key_metadata_; + + std::string aad_prefix_; + bool store_aad_prefix_in_file_; + std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath> column_property_map_; + }; + bool encryptedFooter() const { return encrypted_footer_; } + + const EncryptionAlgorithm getAlgorithm() { + return algorithm_; + } + + const std::string& getFooterEncryptionKey() { + return (encrypted_footer_? footer_key_ : NULL_STRING); + } + + const std::string& getFooterEncryptionKeyMetadata() { + return (encrypted_footer_? footer_key_metadata_ : NULL_STRING); + } + + const std::string& getFooterSigningKey() { + return (encrypted_footer_? 
NULL_STRING : footer_key_); + } + + const std::string& getFooterSigningKeyMetadata() { + return (encrypted_footer_? NULL_STRING : footer_key_metadata_); + } + + const std::string& getFileAAD() const { return file_AAD_; } + + std::shared_ptr getColumnProperties( + const std::shared_ptr& column_path); + + private: + EncryptionAlgorithm algorithm_; + std::string footer_key_; + std::string footer_key_metadata_; + bool encrypted_footer_; + std::string file_AAD_; + + std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath> column_property_map_; + + FileEncryptionProperties(ParquetCipher::type cipher, + std::string footer_key, + std::string footer_key_metadata, + bool encrypted_footer, + const std::string& aad_prefix, + bool store_aad_prefix_in_file, + const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& + column_property_map); +}; + +} // namespace parquet + +#endif // PARQUET_ENCRYPTION_PROPERTIES_H diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index c93a435e8233..a9ebdb5f00cc 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -23,12 +23,8 @@ #include #include -#include -#include - #include "arrow/type.h" -#include "parquet/encryption.h" #include "parquet/exception.h" #include "parquet/parquet_version.h" #include "parquet/platform.h" @@ -36,6 +32,8 @@ #include "parquet/types.h" #include "arrow/util/logging.h" #include "arrow/util/utf8.h" +#include "parquet/encryption_properties.h" + namespace parquet { struct ParquetVersion { @@ -44,369 +42,6 @@ struct ParquetVersion { static int64_t DEFAULT_BUFFER_SIZE = 1024; static bool DEFAULT_USE_BUFFERED_STREAM = false; -static constexpr bool DEFAULT_CHECK_SIGNATURE = true; -static const std::string NULL_STRING = ""; - -class PARQUET_EXPORT ColumnEncryptionProperties { - public: - class Builder { - public: - // Convenience builder for regular (not nested) columns. 
- Builder(const std::string name) { - Builder(schema::ColumnPath::FromDotString(name), true); - } - - // Convenience builder for encrypted columns. - Builder(const std::shared_ptr& path) - : Builder(path, true) {} - - // Set a column-specific key. - // If key is not set on an encrypted column, the column will - // be encrypted with the footer key. - // keyBytes Key length must be either 16, 24 or 32 bytes. - Builder* withKey(const std::string& key) { - if (key.empty ()) - return this; - - DCHECK(!key.empty()); - key_ = key; - return this; - } - - // Set a key retrieval metadata. - // use either withKeyMetaData or withKeyID, not both - Builder* withKeyMetaData(const std::string& key_metadata) { - DCHECK(!key_metadata.empty()); - DCHECK(key_metadata_.empty()); - key_metadata_ = key_metadata; - return this; - } - - // Set a key retrieval metadata (converted from String). - // use either withKeyMetaData or withKeyID, not both - // key_id will be converted to metadata (UTF-8 array). - Builder* withKeyID(std::string key_id) { - //key_id is expected to be in UTF8 encoding - ::arrow::util::InitializeUTF8(); - const uint8_t *data = reinterpret_cast(key_id.c_str()); - if (!::arrow::util::ValidateUTF8(data, key_id.size())) - throw ParquetException("key id should be in UTF8 encoding"); - - DCHECK(!key_id.empty()); - this->withKeyMetaData(key_id); - return this; - } - - std::shared_ptr build() { - return std::shared_ptr( - new ColumnEncryptionProperties(encrypted_, - column_path_, - key_, - key_metadata_)); - } - - private: - const std::shared_ptr column_path_; - bool encrypted_; - std::string key_; - std::string key_metadata_; - - Builder(const std::shared_ptr& path, bool encrypted) - : column_path_(path), encrypted_(encrypted) {} - }; - const std::shared_ptr& getPath() { return column_path_; } - bool isEncrypted() const { return encrypted_; } - bool isEncryptedWithFooterKey() const { return encrypted_with_footer_key_; } - const std::string& getKey() const { return key_; } - 
const std::string& getKeyMetaData() const { return key_metadata_; } - - ColumnEncryptionProperties() = default; - ColumnEncryptionProperties(const ColumnEncryptionProperties& other) = default; - ColumnEncryptionProperties(ColumnEncryptionProperties&& other) = default; - - private: - const std::shared_ptr column_path_; - bool encrypted_; - bool encrypted_with_footer_key_; - std::string key_; - std::string key_metadata_; - explicit ColumnEncryptionProperties( - bool encrypted, - const std::shared_ptr& column_path, - const std::string& key, - const std::string& key_metadata):column_path_(column_path){ - DCHECK(column_path != nullptr); - if (!encrypted) - DCHECK(key.empty() && key_metadata.empty()); - - if (!key.empty()) - DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); - - encrypted_with_footer_key_ = (encrypted && key.empty()); - if (encrypted_with_footer_key_) - DCHECK(key_metadata.empty()); - - encrypted_ = encrypted; - key_metadata_ = key_metadata; - key_ = key; - } -}; - -class PARQUET_EXPORT ColumnDecryptionProperties { - public: - class Builder { - public: - // convenience builder for regular (not nested) columns. - Builder(const std::string name) { - Builder(schema::ColumnPath::FromDotString(name)); - } - - Builder(const std::shared_ptr& path) - : column_path_(path) {} - - // Set an explicit column key. If applied on a file that contains - // key metadata for this column the metadata will be ignored, - // the column will be decrypted with this key. - // key length must be either 16, 24 or 32 bytes. 
- Builder* withKey(const std::string& key) { - if (key.empty ()) - return this; - - DCHECK(!key.empty()); - key_ = key; - return this; - } - - std::shared_ptr build() { - return std::shared_ptr( - new ColumnDecryptionProperties(column_path_, key_)); - } - - private: - const std::shared_ptr column_path_; - std::string key_; - }; - - ColumnDecryptionProperties() = default; - ColumnDecryptionProperties(const ColumnDecryptionProperties& other) = default; - ColumnDecryptionProperties(ColumnDecryptionProperties&& other) = default; - - const std::shared_ptr& getPath() { return column_path_; } - const std::string& getKey() const { return key_; } - - private: - const std::shared_ptr column_path_; - std::string key_; - - // This class is only required for setting explicit column decryption keys - - // to override key retriever (or to provide keys when key metadata and/or - // key retriever are not available) - explicit ColumnDecryptionProperties( - const std::shared_ptr& column_path, - const std::string& key):column_path_(column_path){ - DCHECK(column_path != nullptr); - - if (!key.empty()) - DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); - - key_ = key; - } -}; - -class PARQUET_EXPORT AADPrefixVerifier { - public: - // Verifies identity (AAD Prefix) of individual file, - // or of file collection in a data set. - // Throws exception if an AAD prefix is wrong. - // In a data set, AAD Prefixes should be collected, - // and then checked for missing files. - virtual void check(std::string aad_prefix) = 0; -}; - -class PARQUET_EXPORT FileDecryptionProperties { - public: - class Builder { - public: - Builder(){ - check_plaintext_footer_integrity_ = DEFAULT_CHECK_SIGNATURE; - } - - // Set an explicit footer key. If applied on a file that contains - // footer key metadata the metadata will be ignored, the footer - // will be decrypted/verified with this key. - // If explicit key is not set, footer key will be fetched from - // key retriever. 
- // param footerKey Key length must be either 16, 24 or 32 bytes. - Builder* withFooterKey(std::string footer_key) { - if (footer_key.empty ()) { - return this; - } - DCHECK(!footer_key.empty()); - footer_key_ = footer_key; - return this; - } - - // Set explicit column keys (decryption properties). - // Its also possible to set a key retriever on this property object. - // Upon file decryption, availability of explicit keys is checked before - // invocation of the retriever callback. - // If an explicit key is available for a footer or a column, - // its key metadata will be ignored. - Builder* withColumnKeys(const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& - column_properties) { - if (column_properties.size () == 0) - return this; - - if (column_property_map_.size () != 0) - throw ParquetException("Column properties already set"); - - column_property_map_ = column_properties; - return this; - } - - // Set a key retriever callback. Its also possible to - // set explicit footer or column keys on this file property object. - // Upon file decryption, availability of explicit keys is checked before - // invocation of the retriever callback. - // If an explicit key is available for a footer or a column, - // its key metadata will be ignored. - Builder* withKeyRetriever(const std::shared_ptr& - key_retriever) { - if (key_retriever == NULLPTR) - return this; - - DCHECK(key_retriever_ == NULLPTR); - key_retriever_ = key_retriever; - return this; - } - - // Skip integrity verification of plaintext footers. - // If not called, integrity of plaintext footers will be checked in runtime, - // and an exception will be thrown in the following situations: - // - footer signing key is not available - // (not passed, or not found by key retriever) - // - footer content and signature don't match - Builder* withoutFooterSignatureVerification() { - check_plaintext_footer_integrity_ = false; - return this; - } - - // Explicitly supply the file AAD prefix. 
- // A must when a prefix is used for file encryption, but not stored in file. - // If AAD prefix is stored in file, it will be compared to the explicitly - // supplied value and an exception will be thrown if they differ. - Builder* withAADPrefix(std::string aad_prefix) { - if (aad_prefix.empty()) { - return this; - } - DCHECK(aad_prefix_.empty()); - aad_prefix_ = aad_prefix; - return this; - } - - // Set callback for verification of AAD Prefixes stored in file. - Builder* withAADPrefixVerifier( - std::shared_ptr aad_prefix_verifier) { - if (aad_prefix_verifier == NULLPTR) - return this; - - DCHECK(aad_prefix_verifier_ == NULLPTR); - aad_prefix_verifier_ = aad_prefix_verifier; - return this; - } - - std::shared_ptr build() { - return std::shared_ptr( - new FileDecryptionProperties(footer_key_, - key_retriever_, - check_plaintext_footer_integrity_, - aad_prefix_, - aad_prefix_verifier_, - column_property_map_)); - } - - private: - std::string footer_key_; - std::string aad_prefix_; - std::shared_ptr aad_prefix_verifier_; - - std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_property_map_; - - std::shared_ptr key_retriever_; - bool check_plaintext_footer_integrity_; - }; - - const std::string& getColumnKey( - const std::shared_ptr& column_path) { - if (column_property_map_.find(column_path) != column_property_map_.end()) { - auto column_prop = column_property_map_[column_path]; - if (column_prop != nullptr) - return column_prop->getKey(); - } - return NULL_STRING; - } - - const std::string& getFooterKey() { - return footer_key_; - } - - const std::string& getAADPrefix() { return aad_prefix_; } - - std::shared_ptr getKeyRetriever() { - return key_retriever_; - } - - bool checkFooterIntegrity() { - return check_plaintext_footer_integrity_; - } - - const std::shared_ptr &getAADPrefixVerifier() { - return aad_prefix_verifier_; - } - - private: - std::string footer_key_; - std::string aad_prefix_; - std::shared_ptr aad_prefix_verifier_; - - 
std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_property_map_; - - std::shared_ptr key_retriever_; - bool check_plaintext_footer_integrity_; - - - FileDecryptionProperties( - const std::string& footer_key, - const std::shared_ptr& key_retriever, - bool check_plaintext_footer_integrity, - std::string aad_prefix, - std::shared_ptr aad_prefix_verifier, - std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_property_map) { - DCHECK(!footer_key.empty() || - NULLPTR != key_retriever || - 0 != column_property_map.size()); - if (!footer_key.empty()) - DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || - footer_key.length() == 32); - if (footer_key.empty() && check_plaintext_footer_integrity) - DCHECK(NULLPTR != key_retriever); - aad_prefix_verifier_ = aad_prefix_verifier; - footer_key_ = footer_key; - check_plaintext_footer_integrity_ = check_plaintext_footer_integrity; - key_retriever_ = key_retriever; - aad_prefix_ = aad_prefix; - column_property_map_ = column_property_map; - } - -}; class PARQUET_EXPORT ReaderProperties { public: @@ -475,10 +110,6 @@ static constexpr ParquetVersion::type DEFAULT_WRITER_VERSION = ParquetVersion::PARQUET_1_0; static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION; static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED; -static constexpr ParquetCipher::type DEFAULT_ENCRYPTION_ALGORITHM = ParquetCipher::AES_GCM_V1; -static constexpr int32_t MAXIMAL_AAD_METADATA_LENGTH = 256; -static constexpr bool DEFAULT_ENCRYPTED_FOOTER = true; -static constexpr int32_t AAD_FILE_UNIQUE_LENGTH = 8; class PARQUET_EXPORT ColumnProperties { public: @@ -527,208 +158,6 @@ class PARQUET_EXPORT ColumnProperties { size_t max_stats_size_; }; -class PARQUET_EXPORT FileEncryptionProperties { - public: - class Builder { - public: - Builder(const std::string& footer_key) - : parquet_cipher_(DEFAULT_ENCRYPTION_ALGORITHM), - encrypted_footer_(DEFAULT_ENCRYPTED_FOOTER) { - 
footer_key_ = footer_key; - store_aad_prefix_in_file_ = false; - } - - // Create files with plaintext footer. - // If not called, the files will be created with encrypted footer (default). - Builder* withPlaintextFooter() { - encrypted_footer_ = false; - return this; - } - - // Set encryption algorithm. - // If not called, files will be encrypted with AES_GCM_V1 (default). - Builder* withAlgorithm(ParquetCipher::type parquet_cipher) { - parquet_cipher_ = parquet_cipher; - return this; - } - - // Set a key retrieval metadata (converted from String). - // use either withFooterKeyMetaData or withFooterKeyID, not both. - Builder* withFooterKeyID(std::string key_id) { - //key_id is expected to be in UTF8 encoding - ::arrow::util::InitializeUTF8(); - const uint8_t* data = reinterpret_cast(key_id.c_str()); - if (!::arrow::util::ValidateUTF8(data, key_id.size())) - throw ParquetException("footer key id should be in UTF8 encoding"); - - if (key_id.empty()) - return this; - - return withFooterKeyMetadata(key_id); - } - - // Set a key retrieval metadata. - // use either withFooterKeyMetaData or withFooterKeyID, not both. - Builder* withFooterKeyMetadata(const std::string& footer_key_metadata) { - if (footer_key_metadata.empty()) - return this; - - DCHECK(footer_key_metadata_.empty()); - footer_key_metadata_ = footer_key_metadata; - return this; - } - - // Set the file AAD Prefix. - Builder* withAADPrefix(const std::string& aad_prefix) { - if (aad_prefix.empty()) - return this; - - DCHECK(aad_prefix_.empty()); - aad_prefix_ = aad_prefix; - store_aad_prefix_in_file_ = true; - return this; - } - - // Skip storing AAD Prefix in file. - // If not called, and if AAD Prefix is set, it will be stored. - Builder* withoutAADPrefixStorage() { - store_aad_prefix_in_file_ = false; - return this; - } - - // Set the list of encrypted columns and their properties (keys etc). - // If not called, all columns will be encrypted with the footer key. 
- // If called, the file columns not in the list will be left unencrypted. - Builder* withEncryptedColumns( - const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& - encryptedColumns){ - if (encryptedColumns.size () == 0) - return this; - - if (column_property_map_.size () != 0) - throw ParquetException("Column properties already set"); - - column_property_map_ = encryptedColumns; - return this; - } - - std::shared_ptr build() { - return std::shared_ptr( - new FileEncryptionProperties(parquet_cipher_, - footer_key_, - footer_key_metadata_, - encrypted_footer_, - aad_prefix_, - store_aad_prefix_in_file_, - column_property_map_)); - } - - private: - ParquetCipher::type parquet_cipher_; - bool encrypted_footer_; - std::string footer_key_; - std::string footer_key_metadata_; - - std::string aad_prefix_; - bool store_aad_prefix_in_file_; - std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_property_map_; - }; - - bool encryptedFooter() const { return encrypted_footer_; } - - const EncryptionAlgorithm getAlgorithm() { - return algorithm_; - } - - const std::string& getFooterEncryptionKey() { - return (encrypted_footer_? footer_key_ : NULL_STRING); - } - - const std::string& getFooterEncryptionKeyMetadata() { - return (encrypted_footer_? footer_key_metadata_ : NULL_STRING); - } - - const std::string& getFooterSigningKey() { - return (encrypted_footer_? NULL_STRING : footer_key_); - } - - const std::string& getFooterSigningKeyMetadata() { - return (encrypted_footer_? 
NULL_STRING : footer_key_metadata_); - } - - const std::string& getFileAAD() const { return file_AAD_; } - - std::shared_ptr getColumnProperties( - const std::shared_ptr& column_path) { - if (column_property_map_.size () == 0){ - auto builder = std::shared_ptr( - new ColumnEncryptionProperties::Builder (column_path)); - return builder->build(); - } - if (column_property_map_.find(column_path) != column_property_map_.end()) - return column_property_map_[column_path]; - - return NULLPTR; - } - - private: - EncryptionAlgorithm algorithm_; // encryption algorithm - std::string footer_key_; - std::string footer_key_metadata_; - bool encrypted_footer_; - std::string file_AAD_; - - std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_property_map_; - - FileEncryptionProperties(ParquetCipher::type cipher, - std::string footer_key, - std::string footer_key_metadata, - bool encrypted_footer, - const std::string& aad_prefix, - bool store_aad_prefix_in_file, - const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& - column_property_map) - : footer_key_(footer_key), - footer_key_metadata_(footer_key_metadata), - encrypted_footer_(encrypted_footer), - column_property_map_(column_property_map){ - DCHECK(!footer_key.empty()); - // footer_key must be either 16, 24 or 32 bytes. 
- DCHECK(footer_key.length() == 16 - || footer_key.length() == 24 - || footer_key.length() == 32); - - uint8_t aad_file_unique[AAD_FILE_UNIQUE_LENGTH]; - memset(aad_file_unique, 0, AAD_FILE_UNIQUE_LENGTH); - RAND_bytes(aad_file_unique, sizeof(AAD_FILE_UNIQUE_LENGTH)); - std::string aad_file_unique_str( - reinterpret_cast(aad_file_unique), - AAD_FILE_UNIQUE_LENGTH) ; - - bool supply_aad_prefix = false; - if (aad_prefix.empty()) - file_AAD_ = aad_file_unique_str; - else { - file_AAD_ = aad_prefix + aad_file_unique_str; - if (!store_aad_prefix_in_file) supply_aad_prefix = true; - } - algorithm_.algorithm = cipher; - algorithm_.aad.aad_file_unique = aad_file_unique_str; - algorithm_.aad.supply_aad_prefix = supply_aad_prefix; - if (!aad_prefix.empty() && store_aad_prefix_in_file) { - algorithm_.aad.aad_prefix = aad_prefix; - } - } -}; - class PARQUET_EXPORT WriterProperties { public: class Builder { From 4d4aef1291afff175eeb80e54d54b31233000dac Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Thu, 25 Apr 2019 23:01:31 +0700 Subject: [PATCH 031/201] update crypto API change --- cpp/src/parquet/CMakeLists.txt | 2 + cpp/src/parquet/column_reader.cc | 91 ++++---- cpp/src/parquet/column_reader.h | 6 +- cpp/src/parquet/column_writer.cc | 104 +++++---- cpp/src/parquet/column_writer.h | 6 +- cpp/src/parquet/file_reader.cc | 162 ++++---------- cpp/src/parquet/file_writer.cc | 76 ++++--- cpp/src/parquet/file_writer.h | 2 +- cpp/src/parquet/internal_file_decryptor.cc | 246 +++++++++++++++++++++ cpp/src/parquet/internal_file_decryptor.h | 116 ++++++++++ cpp/src/parquet/internal_file_encryptor.cc | 144 ++++++++++++ cpp/src/parquet/internal_file_encryptor.h | 61 +++++ cpp/src/parquet/metadata.cc | 180 +++++---------- cpp/src/parquet/metadata.h | 34 ++- cpp/src/parquet/thrift.h | 27 ++- 15 files changed, 864 insertions(+), 393 deletions(-) create mode 100644 cpp/src/parquet/internal_file_decryptor.cc create mode 100644 cpp/src/parquet/internal_file_decryptor.h create mode 100644 
cpp/src/parquet/internal_file_encryptor.cc create mode 100644 cpp/src/parquet/internal_file_encryptor.h diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 86fe61d6af00..820934230669 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -169,6 +169,8 @@ set(PARQUET_SRCS encoding.cc encryption.cc encryption_properties.cc + internal_file_decryptor.cc + internal_file_encryptor.cc file_reader.cc file_writer.cc metadata.cc diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index ccaa215c8b71..fc4c0d8fbafd 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -38,6 +38,7 @@ #include "parquet/column_page.h" #include "parquet/encoding.h" +#include "parquet/internal_file_decryptor.h" #include "parquet/properties.h" #include "parquet/statistics.h" #include "parquet/thrift.h" // IWYU pragma: keep @@ -126,8 +127,9 @@ class SerializedPageReader : public PageReader { int64_t total_num_rows, Compression::type codec, bool column_has_dictionary, int16_t row_group_ordinal, int16_t column_ordinal, - const std::shared_ptr encryption, - ::arrow::MemoryPool* pool) + ::arrow::MemoryPool* pool, + std::shared_ptr meta_decryptor, + std::shared_ptr data_decryptor) : stream_(stream), decompression_buffer_(AllocateBuffer(pool, 0)), first_page_(true), @@ -137,20 +139,24 @@ class SerializedPageReader : public PageReader { page_ordinal_(-1), seen_num_rows_(0), total_num_rows_(total_num_rows), - encryption_(encryption), - decryption_buffer_(AllocateBuffer(pool, 0)) { + decryption_buffer_(AllocateBuffer(pool, 0)), + meta_decryptor_(meta_decryptor), + data_decryptor_(data_decryptor) { max_page_header_size_ = kDefaultMaxPageHeaderSize; decompressor_ = GetCodecFromArrow(codec); - if (encryption != NULLPTR) { - DCHECK (!encryption_->fileAAD().empty()); + if (data_decryptor_ != NULLPTR) { + DCHECK (!data_decryptor_->fileAAD().empty()); //prepare the AAD for quick update later 
data_pageAAD_ = parquet_encryption::createModuleAAD( - encryption_->fileAAD(), + data_decryptor_->fileAAD(), parquet_encryption::DataPage, row_group_ordinal_, column_ordinal_, (int16_t)-1); + } + if (meta_decryptor_ != NULLPTR) { + DCHECK (!meta_decryptor_->fileAAD().empty()); data_page_headerAAD_ = parquet_encryption::createModuleAAD( - encryption_->fileAAD(), + meta_decryptor_->fileAAD(), parquet_encryption::DataPageHeader, row_group_ordinal_, column_ordinal_, (int16_t)-1); @@ -190,8 +196,9 @@ class SerializedPageReader : public PageReader { int64_t total_num_rows_; // Encryption - std::shared_ptr encryption_; std::shared_ptr decryption_buffer_; + std::shared_ptr meta_decryptor_; + std::shared_ptr data_decryptor_; }; std::shared_ptr SerializedPageReader::NextPage() { @@ -226,20 +233,23 @@ std::shared_ptr SerializedPageReader::NextPage() { // This gets used, then set by DeserializeThriftMsg header_size = static_cast(buffer.size()); try { - if (encryption_!= NULLPTR) { - if (current_page_is_dictionary) { - aad = parquet_encryption::createModuleAAD(encryption_->fileAAD(), - parquet_encryption::DictionaryPageHeader, - row_group_ordinal_, - column_ordinal_, (int16_t)-1); - encryption_->aad(aad); - } else { - parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, page_ordinal_); - encryption_->aad(data_page_headerAAD_); - } - } - DeserializeThriftMsg(reinterpret_cast(buffer.data()), - &header_size, ¤t_page_header_, encryption_); + if (meta_decryptor_ != NULLPTR) { + if (current_page_is_dictionary) { + aad = parquet_encryption::createModuleAAD( + meta_decryptor_->fileAAD(), + parquet_encryption::DictionaryPageHeader, + row_group_ordinal_, + column_ordinal_, (int16_t)-1); + meta_decryptor_->aad(aad); + } else { + parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, + page_ordinal_); + meta_decryptor_->aad(data_page_headerAAD_); + } + } + DeserializeThriftMsg(reinterpret_cast(buffer.data()), + &header_size, ¤t_page_header_, + meta_decryptor_); break; } catch 
(std::exception& e) { // Failed to deserialize. Double the allowed page header size and try again @@ -257,18 +267,18 @@ std::shared_ptr SerializedPageReader::NextPage() { int compressed_len = current_page_header_.compressed_page_size; int uncompressed_len = current_page_header_.uncompressed_page_size; - if (encryption_!= NULLPTR){ - DCHECK(!encryption_->fileAAD().empty()); + if (data_decryptor_ != NULLPTR){ + DCHECK(!data_decryptor_->fileAAD().empty()); if (current_page_is_dictionary){ aad = parquet_encryption::createModuleAAD( - encryption_->fileAAD(), + data_decryptor_->fileAAD(), parquet_encryption::DictionaryPage, row_group_ordinal_, column_ordinal_, (int16_t)-1); - encryption_->aad(aad); + data_decryptor_->aad(aad); } else { parquet_encryption::quickUpdatePageAAD(data_pageAAD_, page_ordinal_); - encryption_->aad(data_pageAAD_); + data_decryptor_->aad(data_pageAAD_); } } @@ -283,10 +293,10 @@ std::shared_ptr SerializedPageReader::NextPage() { } // Decrypt it if we need to - if (encryption_ != nullptr) { - decryption_buffer_->Resize(encryption_->CalculatePlainSize(compressed_len), false); - compressed_len = parquet_encryption::Decrypt( - encryption_, false, buffer, compressed_len, decryption_buffer_->mutable_data()); + if (data_decryptor_ != nullptr) { + decryption_buffer_->Resize(compressed_len - data_decryptor_->CiphertextSizeDelta()); + compressed_len = data_decryptor_->Decrypt( + buffer, compressed_len, decryption_buffer_->mutable_data()); buffer = decryption_buffer_->data(); } @@ -357,15 +367,18 @@ std::shared_ptr SerializedPageReader::NextPage() { return std::shared_ptr(nullptr); } -std::unique_ptr PageReader::Open( - const std::shared_ptr& stream, int64_t total_num_rows, - Compression::type codec, bool column_has_dictionary, - int16_t row_group_ordinal, int16_t column_ordinal, - const std::shared_ptr encryption, - ::arrow::MemoryPool* pool) { +std::unique_ptr PageReader::Open(const std::shared_ptr& stream, + int64_t total_num_rows, + Compression::type 
codec, + bool column_has_dictionary, + int16_t row_group_ordinal, + int16_t column_ordinal, + ::arrow::MemoryPool* pool, + std::shared_ptr meta_decryptor, + std::shared_ptr data_decryptor) { return std::unique_ptr( new SerializedPageReader(stream, total_num_rows, codec, column_has_dictionary, - row_group_ordinal, column_ordinal, encryption, pool)); + row_group_ordinal, column_ordinal, pool, meta_decryptor, data_decryptor)); } // ---------------------------------------------------------------------- diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 81479fdd7c36..8e07e9a01044 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -44,6 +44,7 @@ class RleDecoder; namespace parquet { class Page; +class Decryptor; // 16 MB is the default maximum page header size static constexpr uint32_t kDefaultMaxPageHeaderSize = 16 * 1024 * 1024; @@ -82,8 +83,9 @@ class PARQUET_EXPORT PageReader { const std::shared_ptr& stream, int64_t total_num_rows, Compression::type codec, bool column_has_dictionary = false, int16_t row_group_ordinal = -1, int16_t column_ordinal = -1, - const std::shared_ptr& encryption = NULLPTR, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), + std::shared_ptr meta_decryptor = NULLPTR, + std::shared_ptr data_decryptor = NULLPTR); // @returns: shared_ptr(nullptr) on EOS, std::shared_ptr // containing new Page otherwise diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 6053e56d376f..d855e0cf3c10 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -40,6 +40,7 @@ #include "parquet/metadata.h" #include "parquet/platform.h" #include "parquet/properties.h" +#include "parquet/internal_file_encryptor.h" #include "parquet/schema.h" #include "parquet/statistics.h" #include "parquet/thrift.h" @@ -138,10 +139,11 @@ class SerializedPageWriter : public PageWriter { 
public: SerializedPageWriter(const std::shared_ptr& sink, Compression::type codec, - const std::shared_ptr& encryption, ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, int16_t column_chunk_ordinal, - MemoryPool* pool = arrow::default_memory_pool()) + MemoryPool* pool = arrow::default_memory_pool(), + std::shared_ptr meta_encryptor = NULLPTR, + std::shared_ptr data_encryptor = NULLPTR) : sink_(sink), metadata_(metadata), pool_(pool), @@ -153,16 +155,19 @@ class SerializedPageWriter : public PageWriter { page_ordinal_(0), row_group_ordinal_(row_group_ordinal), column_ordinal_(column_chunk_ordinal), - encryption_(encryption){ - if (encryption != NULLPTR) { + meta_encryptor_(meta_encryptor), + data_encryptor_(data_encryptor){ + if (data_encryptor_ != NULLPTR) { //prepare the add for quick update later data_pageAAD_ = parquet_encryption::createModuleAAD( - encryption_->fileAAD(), + data_encryptor_->fileAAD(), parquet_encryption::DataPage, row_group_ordinal_, column_ordinal_, (int16_t)-1); + } + if (meta_encryptor_ != NULLPTR) { data_page_headerAAD_ = parquet_encryption::createModuleAAD( - encryption_->fileAAD(), + meta_encryptor_->fileAAD(), parquet_encryption::DataPageHeader, row_group_ordinal_, column_ordinal_, (int16_t)-1); @@ -192,24 +197,17 @@ class SerializedPageWriter : public PageWriter { int32_t output_data_len = static_cast(compressed_data->size()); std::shared_ptr encrypted_data_buffer = nullptr; - if (encryption_.get()) { - encryption_->aad( - parquet_encryption::createModuleAAD(encryption_->fileAAD(), + if (data_encryptor_.get()) { + data_encryptor_->aad( + parquet_encryption::createModuleAAD(data_encryptor_->fileAAD(), parquet_encryption::DictionaryPage, row_group_ordinal_, column_ordinal_, (int16_t)-1)); - encrypted_data_buffer = std::static_pointer_cast( - AllocateBuffer(pool_, encryption_->CalculateCipherSize(output_data_len))); - output_data_len = parquet_encryption::Encrypt( - encryption_, false, compressed_data->data(), 
output_data_len, - encrypted_data_buffer->mutable_data()); + AllocateBuffer(pool_, data_encryptor_->CiphertextSizeDelta() + output_data_len)); + output_data_len = data_encryptor_->Encrypt(compressed_data->data(), output_data_len, + encrypted_data_buffer->mutable_data()); output_data_buffer = encrypted_data_buffer->data(); - encryption_->aad( - parquet_encryption::createModuleAAD(encryption_->fileAAD(), - parquet_encryption::DictionaryPageHeader, - row_group_ordinal_, - column_ordinal_, (int16_t)-1)); } format::PageHeader page_header; @@ -224,7 +222,15 @@ class SerializedPageWriter : public PageWriter { if (dictionary_page_offset_ == 0) { dictionary_page_offset_ = start_pos; } - int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), encryption_); + + if (meta_encryptor_) { + meta_encryptor_->aad( + parquet_encryption::createModuleAAD(meta_encryptor_->fileAAD(), + parquet_encryption::DictionaryPageHeader, + row_group_ordinal_, + column_ordinal_, (int16_t)-1)); + } + int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_); PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len)); total_uncompressed_size_ += uncompressed_size + header_size; @@ -240,16 +246,15 @@ class SerializedPageWriter : public PageWriter { metadata_->Finish(num_values_, dictionary_page_offset_, -1, data_page_offset_, total_compressed_size_, total_uncompressed_size_, has_dictionary, fallback); - auto props = metadata_->get_encryption_props(metadata_->descr()->path()); - if (props != nullptr){ - props->aad( - parquet_encryption::createModuleAAD(encryption_->fileAAD(), + if (meta_encryptor_ != nullptr){ + meta_encryptor_->aad( + parquet_encryption::createModuleAAD(meta_encryptor_->fileAAD(), parquet_encryption::ColumnMetaData, row_group_ordinal_, column_ordinal_, (int16_t)-1)); } // Write metadata at end of column chunk - metadata_->WriteTo(sink_.get(), props); + metadata_->WriteTo(sink_.get(), meta_encryptor_); } /** @@ 
-289,17 +294,14 @@ class SerializedPageWriter : public PageWriter { int32_t output_data_len = static_cast(compressed_data->size()); std::shared_ptr encrypted_data_buffer = AllocateBuffer(pool_, 0); - if (encryption_.get()) { + if (data_encryptor_.get()) { parquet_encryption::quickUpdatePageAAD(data_pageAAD_, page_ordinal_); - encryption_->aad(data_pageAAD_); - encrypted_data_buffer->Resize(encryption_->CalculateCipherSize(output_data_len)); - output_data_len = parquet_encryption::Encrypt( - encryption_, false, compressed_data->data(), output_data_len, + data_encryptor_->aad(data_pageAAD_); + encrypted_data_buffer->Resize(data_encryptor_->CiphertextSizeDelta() + output_data_len); + output_data_len = data_encryptor_->Encrypt( + compressed_data->data(), output_data_len, encrypted_data_buffer->mutable_data()); output_data_buffer = encrypted_data_buffer->data(); - parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, - page_ordinal_); - encryption_->aad(data_page_headerAAD_); } format::PageHeader page_header; @@ -315,7 +317,12 @@ class SerializedPageWriter : public PageWriter { data_page_offset_ = start_pos; } - int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), encryption_); + if (meta_encryptor_) { + parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, + page_ordinal_); + meta_encryptor_->aad(data_page_headerAAD_); + } + int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_); PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len)); total_uncompressed_size_ += uncompressed_size + header_size; @@ -359,7 +366,9 @@ class SerializedPageWriter : public PageWriter { // Compression codec to use. std::unique_ptr compressor_; - std::shared_ptr encryption_; + + std::shared_ptr meta_encryptor_; + std::shared_ptr data_encryptor_; }; // This implementation of the PageWriter writes to the final sink on Close . 
@@ -367,17 +376,19 @@ class BufferedPageWriter : public PageWriter { public: BufferedPageWriter(const std::shared_ptr& sink, Compression::type codec, - const std::shared_ptr& encryption, ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, int16_t current_column_ordinal, - MemoryPool* pool = arrow::default_memory_pool()) + MemoryPool* pool = arrow::default_memory_pool(), + std::shared_ptr meta_encryptor = NULLPTR, + std::shared_ptr data_encryptor = NULLPTR) : final_sink_(sink), metadata_(metadata) { in_memory_sink_ = CreateOutputStream(pool); pager_ = std::unique_ptr( - new SerializedPageWriter(in_memory_sink_, codec, encryption, metadata, - row_group_ordinal, current_column_ordinal, pool)); - } // TODO: nullptr for EncryptionProperties + new SerializedPageWriter(in_memory_sink_, codec, metadata, + row_group_ordinal, current_column_ordinal, pool, + meta_encryptor, data_encryptor)); + } int64_t WriteDictionaryPage(const DictionaryPage& page) override { return pager_->WriteDictionaryPage(page); @@ -420,19 +431,20 @@ class BufferedPageWriter : public PageWriter { std::unique_ptr PageWriter::Open( const std::shared_ptr& sink, Compression::type codec, - const std::shared_ptr& encryption, ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, - int16_t column_chunk_ordinal, MemoryPool* pool, bool buffered_row_group) { + int16_t column_chunk_ordinal, MemoryPool* pool, bool buffered_row_group, + std::shared_ptr meta_encryptor, + std::shared_ptr data_encryptor) { if (buffered_row_group) { return std::unique_ptr( - new BufferedPageWriter(sink, codec, encryption, metadata, + new BufferedPageWriter(sink, codec, metadata, row_group_ordinal, column_chunk_ordinal, - pool)); + pool, meta_encryptor, data_encryptor)); } else { return std::unique_ptr( - new SerializedPageWriter(sink, codec, encryption, metadata, + new SerializedPageWriter(sink, codec, metadata, row_group_ordinal, column_chunk_ordinal, - pool)); + pool, meta_encryptor, data_encryptor)); } } diff 
--git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index d190c503a858..38cfdfd241da 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -47,6 +47,7 @@ class CompressedDataPage; class DictionaryPage; class ColumnChunkMetaDataBuilder; class WriterProperties; +class Encryptor; class PARQUET_EXPORT LevelEncoder { public: @@ -84,11 +85,12 @@ class PARQUET_EXPORT PageWriter { static std::unique_ptr Open( const std::shared_ptr& sink, Compression::type codec, - const std::shared_ptr& encryption, ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal = -1, int16_t column_chunk_ordinal = -1, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), - bool buffered_row_group = false); + bool buffered_row_group = false, + std::shared_ptr header_encryptor = NULLPTR, + std::shared_ptr data_encryptor = NULLPTR); // The Column Writer decides if dictionary encoding is used if set and // if the dictionary encoding has fallen back to default encoding on reaching dictionary diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 7914569eaf13..38af51c218fc 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -33,6 +33,7 @@ #include "parquet/column_scanner.h" #include "parquet/deprecated_io.h" #include "parquet/exception.h" +#include "parquet/internal_file_decryptor.h" #include "parquet/metadata.h" #include "parquet/platform.h" #include "parquet/properties.h" @@ -86,12 +87,13 @@ class SerializedRowGroup : public RowGroupReader::Contents { SerializedRowGroup(const std::shared_ptr& source, FileMetaData* file_metadata, FileCryptoMetaData* file_crypto_metadata, int row_group_number, - const ReaderProperties& props) + const ReaderProperties& props, InternalFileDecryptor* file_decryptor) : source_(source), file_metadata_(file_metadata), file_crypto_metadata_(file_crypto_metadata), properties_(props), - row_group_ordinal_((int16_t)row_group_number){ + 
row_group_ordinal_((int16_t)row_group_number), + file_decryptor_(file_decryptor){ row_group_metadata_ = file_metadata->RowGroup(row_group_number); } @@ -107,15 +109,10 @@ class SerializedRowGroup : public RowGroupReader::Contents { else if (file_metadata_->is_plaintext_mode()) { algorithm = file_metadata_->encryption_algorithm(); } - std::shared_ptr, - std::string, parquet::schema::ColumnPath::CmpColumnPath>> - column_map = properties_.column_map(); // Read column chunk from the file auto col = row_group_metadata_->ColumnChunk(i, row_group_ordinal_, properties_.file_decryption(), - &algorithm, - properties_.fileAAD(), - column_map); + &algorithm, file_decryptor_); int64_t col_start = col->data_page_offset(); if (col->has_dictionary_page() && col->dictionary_page_offset() > 0 && col_start > col->dictionary_page_offset()) { @@ -152,8 +149,9 @@ class SerializedRowGroup : public RowGroupReader::Contents { if (!encrypted) { return PageReader::Open(stream, col->num_values(), col->compression(), col->has_dictionary_page(), - row_group_ordinal_, (int16_t)i/*column_ordinal*/, - nullptr, properties_.memory_pool()); + row_group_ordinal_, + (int16_t)i/* column_ordinal */, + properties_.memory_pool()); } // the column is encrypted @@ -163,38 +161,25 @@ class SerializedRowGroup : public RowGroupReader::Contents { row_group_ordinal_, (int16_t)i, (int16_t)-1); - auto file_decryption = properties_.file_decryption(); - // the column is encrypted with footer key if (crypto_metadata->encrypted_with_footer_key()) { const std::string& footer_key_metadata = file_metadata_->is_plaintext_mode() ? 
file_metadata_->footer_signing_key_metadata() : file_crypto_metadata_->key_metadata(); - std::string footer_key = file_decryption->getFooterKey(); - // ignore footer key metadata if footer key is explicitly set via API - if (footer_key.empty()) { - if (footer_key_metadata.empty()) - throw ParquetException("No footer key or key metadata"); - - if (file_decryption->getKeyRetriever() == nullptr) - throw ParquetException("No footer key or key retriever"); - footer_key = file_decryption->getKeyRetriever()->GetKey( - footer_key_metadata); - } - if (footer_key.empty()) { - throw ParquetException("column is encrypted with null footer key"); - } ParquetCipher::type algorithm = file_metadata_->is_plaintext_mode() ? file_metadata_->encryption_algorithm().algorithm : file_crypto_metadata_->encryption_algorithm().algorithm; - auto footer_encryption = std::make_shared( - algorithm, footer_key, properties_.fileAAD(), aad); + auto meta_decryptor = file_decryptor_->GetFooterDecryptorForColumnMeta( + algorithm, footer_key_metadata, aad); + auto data_decryptor = file_decryptor_->GetFooterDecryptorForColumnData( + algorithm, footer_key_metadata, aad); return PageReader::Open(stream, col->num_values(), col->compression(), - col->has_dictionary_page(), row_group_ordinal_, (int16_t)i, - footer_encryption, properties_.memory_pool()); + col->has_dictionary_page(), row_group_ordinal_, + (int16_t)i, properties_.memory_pool(), + meta_decryptor, data_decryptor); } // file is non-uniform encrypted and the column @@ -203,41 +188,21 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::string column_key_metadata = crypto_metadata->key_metadata(); std::shared_ptr column_path = std::make_shared(crypto_metadata->path_in_schema()); - // encrypted with column key - std::string column_key; - // first look if we already got the key from before - if (column_map != NULLPTR - && column_map->find(column_path) != column_map->end()) { - column_key = column_map->at(column_path); - } else { - 
column_key = file_decryption->getColumnKey(column_path); - // No explicit column key given via API. Retrieve via key metadata. - if (column_key.empty() && !column_key_metadata.empty() && - file_decryption->getKeyRetriever() != nullptr){ - try { - column_key = file_decryption->getKeyRetriever()->GetKey( - column_key_metadata); - } catch (KeyAccessDeniedException &e) { - std::stringstream ss; - ss << "HiddenColumnException, path=" + - column_path->ToDotString() + " " - << e.what() << "\n"; - throw HiddenColumnException(ss.str()); - } - } - } - if (column_key.empty()) { - throw HiddenColumnException("column is encrypted with null key, path=" + - column_path->ToDotString()); - } - auto column_encryption = std::make_shared( - file_crypto_metadata_->encryption_algorithm().algorithm, - column_key, - properties_.fileAAD(), aad); - return PageReader::Open(stream, col->num_values(), col->compression(), - col->has_dictionary_page(), row_group_ordinal_, (int16_t)i, - column_encryption, properties_.memory_pool()); + auto meta_decryptor = file_decryptor_->GetColumnMetaDecryptor( + column_path, + file_crypto_metadata_->encryption_algorithm().algorithm, + column_key_metadata, aad); + auto data_decryptor = file_decryptor_->GetColumnDataDecryptor( + column_path, + file_crypto_metadata_->encryption_algorithm().algorithm, + column_key_metadata, aad); + + return PageReader::Open(stream, col->num_values(), + col->compression(), + col->has_dictionary_page(), row_group_ordinal_, + (int16_t)i, properties_.memory_pool(), + meta_decryptor, data_decryptor); } private: @@ -247,6 +212,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::unique_ptr row_group_metadata_; ReaderProperties properties_; int16_t row_group_ordinal_; + InternalFileDecryptor* file_decryptor_; }; // ---------------------------------------------------------------------- @@ -266,7 +232,8 @@ class SerializedFile : public ParquetFileReader::Contents { std::shared_ptr GetRowGroup(int i) override { 
std::unique_ptr contents( new SerializedRowGroup(source_, file_metadata_.get(), - file_crypto_metadata_.get(), i, properties_)); + file_crypto_metadata_.get(), i, properties_, + file_decryptor_.get())); return std::make_shared(std::move(contents)); } @@ -332,6 +299,7 @@ class SerializedFile : public ParquetFileReader::Contents { if (file_metadata_->is_plaintext_mode()) { auto file_decryption = properties_.file_decryption(); + file_decryptor_.reset(new InternalFileDecryptor(file_decryption)); if (file_decryption == nullptr) { throw ParquetException("No decryption properties are provided"); } @@ -364,41 +332,19 @@ class SerializedFile : public ParquetFileReader::Contents { else fileAAD = file_decryption->getAADPrefix() + aad_file_unique; - properties_.set_fileAAD(fileAAD); + file_decryptor_->file_aad(fileAAD); if (file_decryption->checkFooterIntegrity()) { if (metadata_len - read_metadata_len != 28) { throw ParquetException("Invalid parquet file. Cannot verify plaintext" "mode footer."); } - // get footer key std::string footer_key_metadata = file_metadata_->footer_signing_key_metadata(); - std::string footer_key = file_decryption->getFooterKey(); - // ignore footer key metadata if footer key is explicitly set via API - if (footer_key.empty()) { - if (footer_key_metadata.empty()) - throw ParquetException("No footer key or key metadata"); - if (file_decryption->getKeyRetriever() == nullptr) - throw ParquetException("No footer key or key retriever"); - try { - footer_key = - file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); - } catch (KeyAccessDeniedException &e) { - std::stringstream ss; - ss << "Footer key: access denied " << e.what() << "\n"; - throw ParquetException(ss.str()); - } - } - if (footer_key.empty()) { - throw ParquetException("Footer key unavailable. 
Could not verify " - "plaintext footer metadata"); - } - std::string aad = parquet_encryption::createFooterAAD(fileAAD); - auto encryption = std::make_shared( + auto encryptor = file_decryptor_->GetFooterSigningEncryptor( file_metadata_->encryption_algorithm().algorithm, - footer_key, fileAAD, aad); - if (! file_metadata_->verify(encryption, metadata_buffer->data() - + read_metadata_len, 28)) { + footer_key_metadata); + if (! file_metadata_->verify(encryptor, metadata_buffer->data() + + read_metadata_len)) { throw ParquetException("Invalid parquet file. Could not verify plaintext" " footer metadata"); } @@ -436,6 +382,7 @@ class SerializedFile : public ParquetFileReader::Contents { throw ParquetException("No decryption properties are provided. Could not read " "encrypted footer metadata"); } + file_decryptor_.reset(new InternalFileDecryptor(file_decryption)); uint32_t crypto_metadata_len = footer_len; file_crypto_metadata_ = FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); @@ -467,8 +414,7 @@ class SerializedFile : public ParquetFileReader::Contents { else fileAAD = file_decryption->getAADPrefix() + aad_file_unique; // save fileAAD for later use - properties_.set_fileAAD(fileAAD); - std::string aad = parquet_encryption::createFooterAAD(fileAAD); + file_decryptor_->file_aad(fileAAD); int64_t metadata_offset = file_size - kFooterSize - footer_len + crypto_metadata_len; uint32_t metadata_len = footer_len - crypto_metadata_len; @@ -482,32 +428,13 @@ class SerializedFile : public ParquetFileReader::Contents { // get footer key metadata std::string footer_key_metadata = file_crypto_metadata_->key_metadata(); - std::string footer_key = file_decryption->getFooterKey(); - if (footer_key.empty()) { - if (footer_key_metadata.empty()) - throw ParquetException("No footer key or key metadata"); - if (file_decryption->getKeyRetriever() == nullptr) - throw ParquetException("No footer key or key retriever"); - try { - footer_key = - 
file_decryption->getKeyRetriever()->GetKey(footer_key_metadata); - } catch (KeyAccessDeniedException &e) { - std::stringstream ss; - ss << "Footer key: access denied " << e.what() << "\n";; - throw ParquetException(ss.str()); - } - } - if (footer_key.empty()) { - throw ParquetException("Invalid footer encryption key. " - "Could not parse footer metadata"); - } - auto footer_encryption = std::make_shared( + + auto footer_decryptor = file_decryptor_->GetFooterDecryptor( file_crypto_metadata_->encryption_algorithm().algorithm, - footer_key, - fileAAD, aad); + footer_key_metadata); file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &metadata_len, - footer_encryption); + footer_decryptor); } } @@ -516,6 +443,7 @@ class SerializedFile : public ParquetFileReader::Contents { std::shared_ptr file_metadata_; std::shared_ptr file_crypto_metadata_; ReaderProperties properties_; + std::unique_ptr file_decryptor_; }; // ---------------------------------------------------------------------- diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 815518d229ce..9785bb250c3a 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -26,6 +26,7 @@ #include "parquet/deprecated_io.h" #include "parquet/exception.h" #include "parquet/platform.h" +#include "parquet/internal_file_encryptor.h" #include "parquet/schema.h" #include "parquet/util/memory.h" #include "parquet/util/crypto.h" @@ -86,7 +87,8 @@ class RowGroupSerializer : public RowGroupWriter::Contents { RowGroupMetaDataBuilder* metadata, int16_t row_group_ordinal, const WriterProperties* properties, - bool buffered_row_group = false) + bool buffered_row_group = false, + InternalFileEncryptor* file_encryptor = NULLPTR) : sink_(sink), metadata_(metadata), properties_(properties), @@ -95,7 +97,8 @@ class RowGroupSerializer : public RowGroupWriter::Contents { row_group_ordinal_ (row_group_ordinal), next_column_index_(0), num_rows_(0), - 
buffered_row_group_(buffered_row_group) { + buffered_row_group_(buffered_row_group), + file_encryptor_(file_encryptor) { if (buffered_row_group) { InitColumns(); } else { @@ -131,12 +134,19 @@ class RowGroupSerializer : public RowGroupWriter::Contents { ++next_column_index_; const ColumnDescriptor* column_descr = col_meta->descr(); + auto meta_encryptor = file_encryptor_ + ? file_encryptor_->GetColumnMetaEncryptor(column_descr->path()) + : NULLPTR; + auto data_encryptor = file_encryptor_ + ? file_encryptor_->GetColumnDataEncryptor(column_descr->path()) + : NULLPTR; + std::unique_ptr pager = PageWriter::Open(sink_, properties_->compression(column_descr->path()), - properties_->encryption(column_descr->path()), col_meta, row_group_ordinal_, (int16_t)(current_column_index_-1), - properties_->memory_pool()); + properties_->memory_pool(), false, + meta_encryptor, data_encryptor); column_writers_[0] = ColumnWriter::Make(col_meta, std::move(pager), properties_); return column_writers_[0].get(); } @@ -205,6 +215,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents { int next_column_index_; mutable int64_t num_rows_; bool buffered_row_group_; + InternalFileEncryptor* file_encryptor_; void CheckRowsWritten() const { // verify when only one column is written at a time @@ -232,12 +243,18 @@ class RowGroupSerializer : public RowGroupWriter::Contents { for (int i = 0; i < num_columns(); i++) { auto col_meta = metadata_->NextColumnChunk(); const ColumnDescriptor* column_descr = col_meta->descr(); + auto meta_encryptor = file_encryptor_ + ? file_encryptor_->GetColumnMetaEncryptor(column_descr->path()) + : NULLPTR; + auto data_encryptor = file_encryptor_ + ? 
file_encryptor_->GetColumnDataEncryptor(column_descr->path()) + : NULLPTR; std::unique_ptr pager = PageWriter::Open(sink_, properties_->compression(column_descr->path()), - properties_->encryption(column_descr->path()), col_meta, (int16_t)row_group_ordinal_, (int16_t)current_column_index_, - properties_->memory_pool(), buffered_row_group_); + properties_->memory_pool(), buffered_row_group_, + meta_encryptor, data_encryptor); column_writers_.push_back( ColumnWriter::Make(col_meta, std::move(pager), properties_)); } @@ -290,15 +307,8 @@ class FileSerializer : public ParquetFileWriter::Contents { auto crypto_metadata = metadata_->GetCryptoMetaData(); WriteFileCryptoMetaData(*crypto_metadata, sink_.get()); - ParquetCipher::type algorithm = - file_encryption->getAlgorithm().algorithm; - std::string aad = parquet_encryption::createFooterAAD( - file_encryption->getFileAAD()); - std::shared_ptr footer_encryption = std::make_shared( - algorithm, - file_encryption->getFooterEncryptionKey(), - file_encryption->getFileAAD(), aad); - WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryption, true); + auto footer_encryptor = file_encryptor_->GetFooterEncryptor(); + WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryptor, true); uint32_t footer_and_crypto_len = static_cast(sink_->Tell() - metadata_start); sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4); sink_->Write(PARQUET_EMAGIC, 4); @@ -314,14 +324,8 @@ class FileSerializer : public ParquetFileWriter::Contents { file_metadata_ = metadata_->Finish( &signing_encryption, file_encryption->getFooterSigningKeyMetadata ()); - ParquetCipher::type algorithm = algo.algorithm; - std::string aad = parquet_encryption::createFooterAAD( - file_encryption->getFileAAD()); - std::shared_ptr footer_encryption = std::make_shared( - algorithm, - file_encryption->getFooterSigningKey(), - file_encryption->getFileAAD(), aad); - WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryption, false); + auto 
footer_signing_encryptor = file_encryptor_->GetFooterSigningEncryptor(); + WriteFileMetaData(*file_metadata_, sink_.get(), footer_signing_encryptor, false); } } @@ -347,7 +351,7 @@ class FileSerializer : public ParquetFileWriter::Contents { auto rg_metadata = metadata_->AppendRowGroup(); std::unique_ptr contents(new RowGroupSerializer( sink_, rg_metadata, (int16_t)(num_row_groups_-1), properties_.get(), - buffered_row_group)); + buffered_row_group, file_encryptor_.get())); row_group_writer_.reset(new RowGroupWriter(std::move(contents))); return row_group_writer_.get(); @@ -388,12 +392,22 @@ class FileSerializer : public ParquetFileWriter::Contents { // Only one of the row group writers is active at a time std::unique_ptr row_group_writer_; + std::unique_ptr file_encryptor_; + void StartFile() { - if (properties_->file_encryption() == nullptr) { - // Parquet files always start with PAR1 + auto file_encryption = properties_->file_encryption(); + if (file_encryption == nullptr) { + // Unencrypted parquet files always start with PAR1 PARQUET_THROW_NOT_OK(sink_->Write(PARQUET_MAGIC, 4)); } else { - PARQUET_THROW_NOT_OK(sink_->Write(PARQUET_EMAGIC, 4)); + file_encryptor_.reset(new InternalFileEncryptor(file_encryption)); + if (file_encryption->encryptedFooter()) { + PARQUET_THROW_NOT_OK(sink_->Write(PARQUET_EMAGIC, 4)); + } + else { + // plaintext mode footer + PARQUET_THROW_NOT_OK(sink_->Write(PARQUET_MAGIC, 4)); + } } } }; @@ -431,9 +445,9 @@ std::unique_ptr ParquetFileWriter::Open( } void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sink, - const std::shared_ptr& footer_encryption, + const std::shared_ptr& encryptor, bool encrypt_footer) { - if (footer_encryption == nullptr) { + if (encryptor == nullptr) { // Write MetaData int64_t position = -1; PARQUET_THROW_NOT_OK(sink->Tell(&position)); @@ -449,11 +463,11 @@ void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sin } else { if (encrypt_footer) { // encrypt and 
write to sink - file_metadata.WriteTo(sink, footer_encryption); + file_metadata.WriteTo(sink, encryptor); } else { uint32_t metadata_len = static_cast(sink->Tell()); - file_metadata.WriteTo(sink, footer_encryption); + file_metadata.WriteTo(sink, encryptor); metadata_len = static_cast(sink->Tell()) - metadata_len; sink->Write(reinterpret_cast(&metadata_len), 4); diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h index 7932d1968beb..0aad4fc5abc2 100644 --- a/cpp/src/parquet/file_writer.h +++ b/cpp/src/parquet/file_writer.h @@ -95,7 +95,7 @@ class PARQUET_EXPORT RowGroupWriter { ARROW_DEPRECATED("Use version with arrow::io::OutputStream*") PARQUET_EXPORT void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink, - const std::shared_ptr& encryption_properties = NULLPTR, + const std::shared_ptr& encryptor = NULLPTR, bool encrypt_footer = false); void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, OutputStream* sink); diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc new file mode 100644 index 000000000000..91a1730a2383 --- /dev/null +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -0,0 +1,246 @@ +#include "parquet/encryption_properties.h" +#include "parquet/internal_file_decryptor.h" +#include "parquet/util/crypto.h" + +namespace parquet { + +// FooterSigningEncryptor +static inline uint8_t* str2bytes(const std::string& str) { + if (str.empty()) return NULLPTR; + + char* cbytes = const_cast(str.c_str()); + return reinterpret_cast(cbytes); +} + +FooterSigningEncryptor::FooterSigningEncryptor( + ParquetCipher::type algorithm, const std::string& key, + const std::string& file_aad, const std::string& aad) + : algorithm_(algorithm), key_(key), file_aad_(file_aad), aad_(aad) { + aes_encryptor_.reset(new parquet_encryption::AesEncryptor( + algorithm, static_cast(key_.size()), true)); +} + +int FooterSigningEncryptor::CiphertextSizeDelta() { + return 
aes_encryptor_->CiphertextSizeDelta(); +} + +int FooterSigningEncryptor::SignedFooterEncrypt( + const uint8_t* footer, int footer_len, + uint8_t* nonce, uint8_t* encrypted_footer) { + return aes_encryptor_->SignedFooterEncrypt( + footer, footer_len, str2bytes(key_), static_cast(key_.size()), + str2bytes(aad_), static_cast(aad_.size()), nonce, encrypted_footer); +} + +// Decryptor +Decryptor::Decryptor( + parquet_encryption::AesDecryptor* aes_decryptor, + const std::string& key, const std::string& file_aad, + const std::string& aad) + : aes_decryptor_(aes_decryptor), key_(key) + , file_aad_(file_aad), aad_(aad) {} + +int Decryptor::CiphertextSizeDelta() { + return aes_decryptor_->CiphertextSizeDelta(); +} + +int Decryptor::Decrypt(const uint8_t* ciphertext, int ciphertext_len, uint8_t* plaintext) { + return aes_decryptor_->Decrypt( + ciphertext, ciphertext_len, + str2bytes(key_), static_cast(key_.size()), + str2bytes(aad_), static_cast(aad_.size()), plaintext); +} + +// InternalFileDecryptor +InternalFileDecryptor::InternalFileDecryptor(FileDecryptionProperties* properties) + : properties_(properties) {} + +std::shared_ptr InternalFileDecryptor::GetFooterSigningEncryptor( + ParquetCipher::type algorithm, + const std::string& footer_key_metadata) +{ + std::string footer_key = properties_->getFooterKey(); + // ignore footer key metadata if footer key is explicitly set via API + if (footer_key.empty()) { + if (footer_key_metadata.empty()) + throw ParquetException("No footer key or key metadata"); + if (properties_->getKeyRetriever() == nullptr) + throw ParquetException("No footer key or key retriever"); + try { + footer_key = + properties_->getKeyRetriever()->GetKey(footer_key_metadata); + } catch (KeyAccessDeniedException &e) { + std::stringstream ss; + ss << "Footer key: access denied " << e.what() << "\n"; + throw ParquetException(ss.str()); + } + } + if (footer_key.empty()) { + throw ParquetException("Footer key unavailable. 
Could not verify " + "plaintext footer metadata"); + } + + std::string aad = parquet_encryption::createFooterAAD(file_aad_); + + return std::make_shared( + algorithm, footer_key, file_aad_, aad); +} + +std::shared_ptr InternalFileDecryptor::GetFooterDecryptor( + ParquetCipher::type algorithm, + const std::string& footer_key_metadata) { + std::string aad = parquet_encryption::createFooterAAD(file_aad_); + return GetFooterDecryptor(algorithm, footer_key_metadata, aad, true); +} + +std::shared_ptr InternalFileDecryptor::GetFooterDecryptorForColumnMeta( + ParquetCipher::type algorithm, + const std::string& footer_key_metadata, + const std::string& aad) { + return GetFooterDecryptor(algorithm, footer_key_metadata, aad, true); +} + +std::shared_ptr InternalFileDecryptor::GetFooterDecryptorForColumnData( + ParquetCipher::type algorithm, + const std::string& footer_key_metadata, + const std::string& aad) { + return GetFooterDecryptor(algorithm, footer_key_metadata, aad, false); +} + +std::shared_ptr InternalFileDecryptor::GetFooterDecryptor( + ParquetCipher::type algorithm, + const std::string& footer_key_metadata, + const std::string& aad, bool metadata) { + std::string footer_key = properties_->getFooterKey(); + if (footer_key.empty()) { + if (footer_key_metadata.empty()) + throw ParquetException("No footer key or key metadata"); + if (properties_->getKeyRetriever() == nullptr) + throw ParquetException("No footer key or key retriever"); + try { + footer_key = properties_->getKeyRetriever()->GetKey(footer_key_metadata); + } catch (KeyAccessDeniedException &e) { + std::stringstream ss; + ss << "Footer key: access denied " << e.what() << "\n";; + throw ParquetException(ss.str()); + } + } + if (footer_key.empty()) { + throw ParquetException("Invalid footer encryption key. " + "Could not parse footer metadata"); + } + + auto aes_decryptor = metadata + ? 
GetMetaAesDecryptor(algorithm, footer_key.size()) + : GetDataAesDecryptor(algorithm, footer_key.size()); + return std::make_shared(aes_decryptor, footer_key, file_aad_, aad); +} + +std::shared_ptr InternalFileDecryptor::GetColumnMetaDecryptor( + std::shared_ptr column_path, + ParquetCipher::type algorithm, + const std::string& column_key_metadata, + const std::string& aad) { + return GetColumnDecryptor(column_path, algorithm, column_key_metadata, aad, true); +} + +std::shared_ptr InternalFileDecryptor::GetColumnDataDecryptor( + std::shared_ptr column_path, + ParquetCipher::type algorithm, + const std::string& column_key_metadata, + const std::string& aad) { + return GetColumnDecryptor(column_path, algorithm, column_key_metadata, aad, false); +} + +std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( + std::shared_ptr column_path, + ParquetCipher::type algorithm, + const std::string& column_key_metadata, + const std::string& aad, bool metadata) { + std::string column_key; + // first look if we already got the key from before + if (column_map_ != NULLPTR + && column_map_->find(column_path) != column_map_->end()) { + column_key = column_map_->at(column_path); + } else { + column_key = properties_->getColumnKey(column_path); + // No explicit column key given via API. Retrieve via key metadata. 
+ if (column_key.empty() && !column_key_metadata.empty() && + properties_->getKeyRetriever() != nullptr){ + try { + column_key = properties_->getKeyRetriever()->GetKey(column_key_metadata); + } catch (KeyAccessDeniedException &e) { + std::stringstream ss; + ss << "HiddenColumnException, path=" + + column_path->ToDotString() + " " + << e.what() << "\n"; + throw HiddenColumnException(ss.str()); + } + } + } + if (column_key.empty()) { + throw HiddenColumnException("column is encrypted with null key, path=" + + column_path->ToDotString()); + } + + if (column_map_ != NULLPTR) { + // save column key for future use + (*column_map_)[column_path] = column_key; + } + + auto aes_decryptor = metadata + ? GetMetaAesDecryptor(algorithm, column_key.size()) + : GetDataAesDecryptor(algorithm, column_key.size()); + + return std::make_shared(aes_decryptor, column_key, file_aad_, aad); +} + +parquet_encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor( + ParquetCipher::type algorithm, size_t key_size) { + int key_len = static_cast(key_size); + if (key_len == 16) { + if (meta_decryptor_128_ == NULLPTR) { + meta_decryptor_128_.reset(new parquet_encryption::AesDecryptor(algorithm, key_len, false)); + } + return meta_decryptor_128_.get(); + } + else if (key_len == 24) { + if (meta_decryptor_196_ == NULLPTR) { + meta_decryptor_196_.reset(new parquet_encryption::AesDecryptor(algorithm, key_len, false)); + } + return meta_decryptor_196_.get(); + } + else if (key_len == 32) { + if (meta_decryptor_256_ == NULLPTR) { + meta_decryptor_256_.reset(new parquet_encryption::AesDecryptor(algorithm, key_len, false)); + } + return meta_decryptor_256_.get(); + } + throw ParquetException("encryption key must be 16, 24 or 32 bytes in length"); +} + +parquet_encryption::AesDecryptor* InternalFileDecryptor::GetDataAesDecryptor( + ParquetCipher::type algorithm, size_t key_size) { + int key_len = static_cast(key_size); + if (key_len == 16) { + if (data_decryptor_128_ == NULLPTR) { + 
data_decryptor_128_.reset(new parquet_encryption::AesDecryptor(algorithm, key_len, false)); + } + return data_decryptor_128_.get(); + } + else if (key_len == 24) { + if (data_decryptor_196_ == NULLPTR) { + data_decryptor_196_.reset(new parquet_encryption::AesDecryptor(algorithm, key_len, false)); + } + return data_decryptor_196_.get(); + } + else if (key_len == 32) { + if (data_decryptor_256_ == NULLPTR) { + data_decryptor_256_.reset(new parquet_encryption::AesDecryptor(algorithm, key_len, false)); + } + return data_decryptor_256_.get(); + } + throw ParquetException("encryption key must be 16, 24 or 32 bytes in length"); +} + +} // namespace parquet diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h new file mode 100644 index 000000000000..c92fc08c8738 --- /dev/null +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -0,0 +1,116 @@ +#ifndef INTERNAL_FILE_DECRYPTOR_H +#define INTERNAL_FILE_DECRYPTOR_H + +#include +#include "parquet/schema.h" + +namespace parquet_encryption { + class AesDecryptor; + class AesEncryptor; +} + +namespace parquet { + +class FileDecryptionProperties; + +class FooterSigningEncryptor { + public: + FooterSigningEncryptor(ParquetCipher::type algorithm, const std::string& key, + const std::string& file_aad, const std::string& aad); + int CiphertextSizeDelta(); + int SignedFooterEncrypt(const uint8_t* footer, int footer_len, + uint8_t* nonce, uint8_t* encrypted_footer); + + private: + ParquetCipher::type algorithm_; + std::string key_; + std::string file_aad_; + std::string aad_; + + std::shared_ptr aes_encryptor_; +}; + +class Decryptor { + public: + Decryptor(parquet_encryption::AesDecryptor* decryptor, + const std::string& key, const std::string& file_aad, + const std::string& aad); + + const std::string& fileAAD() const { return file_aad_; } + void aad(const std::string& aad) { aad_ = aad; } + + int CiphertextSizeDelta(); + int Decrypt(const uint8_t* ciphertext, int ciphertext_len, uint8_t* 
plaintext); + + private: + parquet_encryption::AesDecryptor* aes_decryptor_; + std::string key_; + std::string file_aad_; + std::string aad_; +}; + +class InternalFileDecryptor { + public: + explicit InternalFileDecryptor(FileDecryptionProperties* propperties); + + void file_aad(const std::string& file_aad) { file_aad_ = file_aad; } + std::string& file_aad() { return file_aad_; } + + std::shared_ptr GetFooterSigningEncryptor( + ParquetCipher::type algorithm, + const std::string& footer_key_metadata); + + std::shared_ptr GetFooterDecryptor( + ParquetCipher::type algorithm, + const std::string& footer_key_metadata); + std::shared_ptr GetFooterDecryptorForColumnMeta( + ParquetCipher::type algorithm, + const std::string& footer_key_metadata, + const std::string& aad); + std::shared_ptr GetFooterDecryptorForColumnData( + ParquetCipher::type algorithm, + const std::string& footer_key_metadata, + const std::string& aad); + std::shared_ptr GetColumnMetaDecryptor( + std::shared_ptr column_path, + ParquetCipher::type algorithm, + const std::string& column_key_metadata, + const std::string& aad); + std::shared_ptr GetColumnDataDecryptor( + std::shared_ptr column_path, + ParquetCipher::type algorithm, + const std::string& column_key_metadata, + const std::string& aad); + + private: + FileDecryptionProperties* properties_; + std::string file_aad_; + std::shared_ptr, + std::string, parquet::schema::ColumnPath::CmpColumnPath>> column_map_; + + std::unique_ptr meta_decryptor_128_; + std::unique_ptr meta_decryptor_196_; + std::unique_ptr meta_decryptor_256_; + std::unique_ptr data_decryptor_128_; + std::unique_ptr data_decryptor_196_; + std::unique_ptr data_decryptor_256_; + + std::shared_ptr GetFooterDecryptor( + ParquetCipher::type algorithm, + const std::string& footer_key_metadata, + const std::string& aad, bool metadata); + std::shared_ptr GetColumnDecryptor( + std::shared_ptr column_path, + ParquetCipher::type algorithm, + const std::string& column_key_metadata, + const 
std::string& aad, bool metadata = false); + + parquet_encryption::AesDecryptor* GetMetaAesDecryptor(ParquetCipher::type algorithm, + size_t key_size); + parquet_encryption::AesDecryptor* GetDataAesDecryptor(ParquetCipher::type algorithm, + size_t key_size); +}; + +} + +#endif // INTERNAL_FILE_ENCRYPTORS_H \ No newline at end of file diff --git a/cpp/src/parquet/internal_file_encryptor.cc b/cpp/src/parquet/internal_file_encryptor.cc new file mode 100644 index 000000000000..7f2195cc20bb --- /dev/null +++ b/cpp/src/parquet/internal_file_encryptor.cc @@ -0,0 +1,144 @@ +#include "parquet/encryption_properties.h" +#include "parquet/internal_file_encryptor.h" +#include "parquet/util/crypto.h" + +namespace parquet { + +static inline uint8_t* str2bytes(const std::string& str) { + if (str.empty()) return NULLPTR; + + char* cbytes = const_cast(str.c_str()); + return reinterpret_cast(cbytes); +} + +// Encryptor +Encryptor::Encryptor( + parquet_encryption::AesEncryptor* aes_encryptor, const std::string& key, + const std::string& file_aad, const std::string& aad) + : aes_encryptor_(aes_encryptor), key_(key) + , file_aad_(file_aad), aad_(aad) {} + +int Encryptor::CiphertextSizeDelta() { + return aes_encryptor_->CiphertextSizeDelta(); +} + +int Encryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* ciphertext) { + return aes_encryptor_->Encrypt( + plaintext, plaintext_len, str2bytes(key_), static_cast(key_.size()), + str2bytes(aad_), static_cast(aad_.size()), ciphertext); +} + +// InternalFileEncryptor +InternalFileEncryptor::InternalFileEncryptor(FileEncryptionProperties* properties) + : properties_(properties) {} + +std::shared_ptr InternalFileEncryptor::GetFooterEncryptor() { + ParquetCipher::type algorithm = properties_->getAlgorithm().algorithm; + std::string aad = parquet_encryption::createFooterAAD(properties_->getFileAAD()); + std::string footer_key = properties_->getFooterEncryptionKey(); + auto aes_encryptor = GetMetaAesEncryptor(algorithm, 
footer_key.size()); + + return std::make_shared(aes_encryptor, footer_key, + properties_->getFileAAD(), aad); +} + +std::shared_ptr InternalFileEncryptor::GetFooterSigningEncryptor() { + ParquetCipher::type algorithm = properties_->getAlgorithm().algorithm; + std::string aad = parquet_encryption::createFooterAAD(properties_->getFileAAD()); + std::string footer_signing_key = properties_->getFooterSigningKey(); + auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_signing_key.size()); + + return std::make_shared(aes_encryptor, footer_signing_key, + properties_->getFileAAD(), aad); +} + +std::shared_ptr InternalFileEncryptor::GetColumnMetaEncryptor( + const std::shared_ptr& column_path) { + return GetColumnEncryptor(column_path, true); +} + +std::shared_ptr InternalFileEncryptor::GetColumnDataEncryptor( + const std::shared_ptr& column_path) { + return GetColumnEncryptor(column_path, false); +} + +std::shared_ptr InternalFileEncryptor::InternalFileEncryptor::GetColumnEncryptor( + const std::shared_ptr& column_path, + bool metadata) { + auto column_prop = properties_->getColumnProperties(column_path); + if (column_prop == NULLPTR) { + return NULLPTR; + } + + std::string key; + if (column_prop->isEncryptedWithFooterKey()) { + if (properties_->encryptedFooter()) { + key = properties_->getFooterEncryptionKey(); + } else { + key = properties_->getFooterSigningKey(); + } + } + else { + key = column_prop->getKey(); + } + + ParquetCipher::type algorithm = properties_->getAlgorithm().algorithm; + auto aes_encryptor = metadata + ? 
GetMetaAesEncryptor(algorithm, key.size()) + : GetDataAesEncryptor(algorithm, key.size()); + + std::string file_aad = properties_->getFileAAD(); + + // TODO: aad + return std::make_shared(aes_encryptor, key, file_aad, ""); +} + +parquet_encryption::AesEncryptor* InternalFileEncryptor::GetMetaAesEncryptor( + ParquetCipher::type algorithm, size_t key_size) { + int key_len = static_cast(key_size); + if (key_len == 16) { + if (meta_encryptor_128_ == NULLPTR) { + meta_encryptor_128_.reset(new parquet_encryption::AesEncryptor(algorithm, key_len, true)); + } + return meta_encryptor_128_.get(); + } + else if (key_len == 24) { + if (meta_encryptor_196_ == NULLPTR) { + meta_encryptor_196_.reset(new parquet_encryption::AesEncryptor(algorithm, key_len, true)); + } + return meta_encryptor_196_.get(); + } + else if (key_len == 32) { + if (meta_encryptor_256_ == NULLPTR) { + meta_encryptor_256_.reset(new parquet_encryption::AesEncryptor(algorithm, key_len, true)); + } + return meta_encryptor_256_.get(); + } + throw ParquetException("encryption key must be 16, 24 or 32 bytes in length"); +} + +parquet_encryption::AesEncryptor* InternalFileEncryptor::GetDataAesEncryptor( + ParquetCipher::type algorithm, size_t key_size) { + int key_len = static_cast(key_size); + if (key_len == 16) { + if (data_encryptor_128_ == NULLPTR) { + data_encryptor_128_.reset(new parquet_encryption::AesEncryptor(algorithm, key_len, false)); + } + return data_encryptor_128_.get(); + } + else if (key_len == 24) { + if (data_encryptor_196_ == NULLPTR) { + data_encryptor_196_.reset(new parquet_encryption::AesEncryptor(algorithm, key_len, false)); + } + return data_encryptor_196_.get(); + } + else if (key_len == 32) { + if (data_encryptor_256_ == NULLPTR) { + data_encryptor_256_.reset(new parquet_encryption::AesEncryptor(algorithm, key_len, false)); + } + return data_encryptor_256_.get(); + } + throw ParquetException("encryption key must be 16, 24 or 32 bytes in length"); +} + +} // namespace parquet \ No newline 
at end of file diff --git a/cpp/src/parquet/internal_file_encryptor.h b/cpp/src/parquet/internal_file_encryptor.h new file mode 100644 index 000000000000..2e3a3df04081 --- /dev/null +++ b/cpp/src/parquet/internal_file_encryptor.h @@ -0,0 +1,61 @@ +#ifndef INTERNAL_FILE_ENCRYPTOR_H +#define INTERNAL_FILE_ENCRYPTOR_H + +namespace parquet_encryption { + class AesEncryptor; +} + +namespace parquet { + +class FileEncryptionProperties; + +class Encryptor { + public: + Encryptor(parquet_encryption::AesEncryptor* aes_encryptor, + const std::string& key, const std::string& file_aad, + const std::string& aad); + const std::string& fileAAD() { return file_aad_; } + void aad(const std::string& aad) { aad_ = aad; } + + int CiphertextSizeDelta(); + int Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* ciphertext); + + private: + parquet_encryption::AesEncryptor* aes_encryptor_; + std::string key_; + std::string file_aad_; + std::string aad_; +}; + +class InternalFileEncryptor { + public: + explicit InternalFileEncryptor(FileEncryptionProperties* propperties); + + std::shared_ptr GetFooterEncryptor(); + std::shared_ptr GetFooterSigningEncryptor(); + std::shared_ptr GetColumnMetaEncryptor(const std::shared_ptr& column_path); + std::shared_ptr GetColumnDataEncryptor(const std::shared_ptr& column_path); + + private: + FileEncryptionProperties* properties_; + + std::unique_ptr meta_encryptor_128_; + std::unique_ptr meta_encryptor_196_; + std::unique_ptr meta_encryptor_256_; + std::unique_ptr data_encryptor_128_; + std::unique_ptr data_encryptor_196_; + std::unique_ptr data_encryptor_256_; + + std::shared_ptr GetColumnEncryptor( + const std::shared_ptr& column_path, + bool metadata); + + parquet_encryption::AesEncryptor* GetMetaAesEncryptor(ParquetCipher::type algorithm, + size_t key_len); + parquet_encryption::AesEncryptor* GetDataAesEncryptor(ParquetCipher::type algorithm, + size_t key_len); +}; + +} + +#endif // INTERNAL_FILE_ENCRYPTORS_H \ No newline at end of file 
diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index cead71d5ac44..4944db078085 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -24,6 +24,7 @@ #include "arrow/util/logging.h" #include "parquet/exception.h" +#include "parquet/internal_file_decryptor.h" #include "parquet/metadata.h" #include "parquet/schema.h" #include "parquet/schema_internal.h" @@ -180,9 +181,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { const ApplicationVersion* writer_version, FileDecryptionProperties* file_decryption = NULLPTR, const EncryptionAlgorithm* algorithm = NULLPTR, - const std::string& fileAAD = "", - std::shared_ptr, std::string, - schema::ColumnPath::CmpColumnPath>> column_map = NULLPTR) + InternalFileDecryptor* file_decryptor = NULLPTR) : column_(column), descr_(descr), writer_version_(writer_version) { metadata_ = column->meta_data; @@ -198,56 +197,24 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { std::make_shared( ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema); std::string key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata; - std::string key; - if (column_map != NULLPTR - && (column_map->find(path) != column_map->end())) { - //First retrieve the key in column_map if exists - key = column_map->at(path); - } else { - key = file_decryption->getColumnKey(path); - // No explicit column key given via API. Retrieve via key metadata. 
- if (key.empty() && !key_metadata.empty() - && file_decryption->getKeyRetriever() != nullptr){ - try { - key = file_decryption->getKeyRetriever()->GetKey(key_metadata); - } catch (KeyAccessDeniedException &e) { - // Hidden column: encrypted, but key unavailable - std::stringstream ss; - ss << "HiddenColumnException path=" + path->ToDotString() + " " - << e.what() << "\n"; - throw HiddenColumnException(ss.str()); - } - if (key.empty ()) - throw HiddenColumnException("HiddenColumnException path=" - + path->ToDotString()); - } - if (column_map != NULLPTR) { - // save column key for future use - (*column_map)[path]=key; - } - } - if (key.empty()) { - // Hidden column: encrypted, but key unavailable - throw HiddenColumnException("HiddenColumnException path= " - + path->ToDotString()); - } + DCHECK(algorithm != NULLPTR); - + DCHECK(file_decryptor != NULLPTR); + std::string aad = parquet_encryption::createModuleAAD( - fileAAD, + file_decryptor->file_aad(), parquet_encryption::ColumnMetaData, row_group_ordinal, column_ordinal, (int16_t)-1); - auto encryption = std::make_shared( - algorithm->algorithm, - key, fileAAD, aad); - uint32_t len = - static_cast(column->encrypted_column_metadata.size()); + auto decryptor = file_decryptor->GetColumnMetaDecryptor( + path, algorithm->algorithm, + key_metadata, aad); + uint32_t len = static_cast(column->encrypted_column_metadata.size()); DeserializeThriftMsg( reinterpret_cast( column->encrypted_column_metadata.c_str()), &len, &metadata_, - encryption, + decryptor, false); } } @@ -348,14 +315,11 @@ std::unique_ptr ColumnChunkMetaData::Make( const ApplicationVersion* writer_version, FileDecryptionProperties* file_decryption, const EncryptionAlgorithm* algorithm, - const std::string& fileAAD, - std::shared_ptr, - std::string, schema::ColumnPath::CmpColumnPath>> column_map) { + InternalFileDecryptor* file_decryptor) { return std::unique_ptr( new ColumnChunkMetaData(metadata, descr, row_group_ordinal, column_ordinal, writer_version, - 
file_decryption, algorithm, fileAAD, - column_map)); + file_decryption, algorithm, file_decryptor)); } ColumnChunkMetaData::ColumnChunkMetaData( @@ -366,9 +330,7 @@ ColumnChunkMetaData::ColumnChunkMetaData( const ApplicationVersion* writer_version, FileDecryptionProperties* file_decryption, const EncryptionAlgorithm* algorithm, - const std::string& fileAAD, - std::shared_ptr, - std::string, schema::ColumnPath::CmpColumnPath>> column_map) + InternalFileDecryptor* file_decryptor) : impl_{std::unique_ptr(new ColumnChunkMetaDataImpl( reinterpret_cast(metadata), descr, @@ -376,7 +338,7 @@ ColumnChunkMetaData::ColumnChunkMetaData( column_ordinal, writer_version, file_decryption, algorithm, - fileAAD, column_map))} {} + file_decryptor))} {} ColumnChunkMetaData::~ColumnChunkMetaData() {} // column chunk int64_t ColumnChunkMetaData::file_offset() const { return impl_->file_offset(); } @@ -460,10 +422,7 @@ class RowGroupMetaData::RowGroupMetaDataImpl { int i, int16_t row_group_ordinal, FileDecryptionProperties* file_decryption = NULLPTR, const EncryptionAlgorithm* algorithm = NULLPTR, - const std::string& fileAAD = "", - std::shared_ptr, - std::string, - parquet::schema::ColumnPath::CmpColumnPath>> column_map = NULLPTR) { + InternalFileDecryptor* file_decryptor = NULLPTR) { if (!(i < num_columns())) { std::stringstream ss; ss << "The file only has " << num_columns() @@ -473,8 +432,8 @@ class RowGroupMetaData::RowGroupMetaDataImpl { return ColumnChunkMetaData::Make( &row_group_->columns[i], schema_->Column(i), row_group_ordinal, (int16_t)i, - writer_version_, file_decryption, algorithm, fileAAD, - column_map); + writer_version_, file_decryption, algorithm, + file_decryptor); } private: @@ -507,11 +466,9 @@ const SchemaDescriptor* RowGroupMetaData::schema() const { return impl_->schema( std::unique_ptr RowGroupMetaData::ColumnChunk( int i, int16_t row_group_ordinal, FileDecryptionProperties* file_decryption, - const EncryptionAlgorithm* algorithm, const std::string& fileAAD, - 
std::shared_ptr, std::string, - schema::ColumnPath::CmpColumnPath>> column_map) const { + const EncryptionAlgorithm* algorithm, InternalFileDecryptor* file_decryptor) const { return impl_->ColumnChunk(i, row_group_ordinal, file_decryption, algorithm, - fileAAD, column_map); + file_decryptor); } // file metadata @@ -521,11 +478,11 @@ class FileMetaData::FileMetaDataImpl { explicit FileMetaDataImpl( const void* metadata, uint32_t* metadata_len, - const std::shared_ptr& encryption = nullptr) + const std::shared_ptr& decryptor = nullptr) : metadata_len_(0) { metadata_.reset(new format::FileMetaData); DeserializeThriftMsg(reinterpret_cast(metadata), metadata_len, - metadata_.get(), encryption, false); + metadata_.get(), decryptor, false); metadata_len_ = *metadata_len; if (metadata_->__isset.created_by) { @@ -539,8 +496,7 @@ class FileMetaData::FileMetaDataImpl { InitKeyValueMetadata(); } - bool verify(std::shared_ptr encryption, - const void* tail, uint32_t tail_len) { + bool verify(std::shared_ptr encryptor, const void* tail) { // serialize the footer uint8_t* serialized_data; uint32_t serialized_len = metadata_len_; @@ -549,17 +505,16 @@ class FileMetaData::FileMetaDataImpl { // encrypt with nonce uint8_t* nonce = const_cast(reinterpret_cast(tail)); - uint8_t* tag = const_cast(reinterpret_cast(tail)) + 12; + uint8_t* tag = const_cast(reinterpret_cast(tail)) + + parquet_encryption::NonceLength; - std::vector encrypted_buffer(encryption->CalculateCipherSize(serialized_len)); + std::vector encrypted_buffer(encryptor->CiphertextSizeDelta() + serialized_len); uint32_t encrypted_len = - parquet_encryption::SignedFooterEncrypt(serialized_data, serialized_len, - encryption->key_bytes(), - encryption->key_length(), - encryption->aad_bytes(), - encryption->aad_length(), - nonce, 12, encrypted_buffer.data()); - return 0 == memcmp(encrypted_buffer.data() + encrypted_len - 16, tag, 16); + encryptor->SignedFooterEncrypt(serialized_data, serialized_len, + nonce, 
encrypted_buffer.data()); + return 0 == memcmp( + encrypted_buffer.data() + encrypted_len - parquet_encryption::GCMTagLength, + tag, parquet_encryption::GCMTagLength); } inline uint32_t size() const { return metadata_len_; } @@ -584,7 +539,7 @@ class FileMetaData::FileMetaDataImpl { const ApplicationVersion& writer_version() const { return writer_version_; } void WriteTo(::arrow::io::OutputStream* dst, - const std::shared_ptr& encryption) const { + const std::shared_ptr& encryptor) const { ThriftSerializer serializer; if (is_plaintext_mode()) { uint8_t* serialized_data; @@ -592,19 +547,20 @@ class FileMetaData::FileMetaDataImpl { serializer.SerializeToBuffer(metadata_.get(), &serialized_len, &serialized_data); // encrypt the footer key - std::vector encrypted_data(encryption->CalculateCipherSize(serialized_len)); - unsigned encrypted_len = parquet_encryption::Encrypt( - encryption, true, serialized_data, serialized_len, encrypted_data.data()); + std::vector encrypted_data(encryptor->CiphertextSizeDelta() + serialized_len); + unsigned encrypted_len = encryptor->Encrypt(serialized_data, serialized_len, + encrypted_data.data()); // write unencrypted footer dst->Write(serialized_data, serialized_len); // write nonce - dst->Write(encrypted_data.data() + 4, 12); + dst->Write(encrypted_data.data() + 4, parquet_encryption::NonceLength); // write tag - dst->Write(encrypted_data.data() + encrypted_len - 16, 16); + dst->Write(encrypted_data.data() + encrypted_len - parquet_encryption::GCMTagLength, + parquet_encryption::GCMTagLength); } else { - serializer.Serialize(metadata_.get(), dst, encryption, false); + serializer.Serialize(metadata_.get(), dst, encryptor, false); } } @@ -692,16 +648,16 @@ class FileMetaData::FileMetaDataImpl { std::shared_ptr FileMetaData::Make( const void* metadata, uint32_t* metadata_len, - const std::shared_ptr& encryption) { + const std::shared_ptr& decryptor) { // This FileMetaData ctor is private, not compatible with std::make_shared return 
std::shared_ptr( - new FileMetaData(metadata, metadata_len, encryption)); + new FileMetaData(metadata, metadata_len, decryptor)); } FileMetaData::FileMetaData(const void* metadata, uint32_t* metadata_len, - const std::shared_ptr& encryption) + const std::shared_ptr& decryptor) : impl_{std::unique_ptr( - new FileMetaDataImpl(metadata, metadata_len, encryption))} {} + new FileMetaDataImpl(metadata, metadata_len, decryptor))} {} FileMetaData::FileMetaData() : impl_{std::unique_ptr(new FileMetaDataImpl())} {} @@ -712,9 +668,8 @@ std::unique_ptr FileMetaData::RowGroup(int i) const { return impl_->RowGroup(i); } -bool FileMetaData::verify(std::shared_ptr encryption, - const void* tail, uint32_t tail_len) { - return impl_->verify(encryption, tail, tail_len); +bool FileMetaData::verify(std::shared_ptr encryptor, const void* tail) { + return impl_->verify(encryptor, tail); } uint32_t FileMetaData::size() const { return impl_->size(); } @@ -769,14 +724,8 @@ void FileMetaData::AppendRowGroups(const FileMetaData& other) { } void FileMetaData::WriteTo(::arrow::io::OutputStream* dst, - const std::shared_ptr& encryption) const { - if (encryption != nullptr) - encryption->aad( - parquet_encryption::createModuleAAD(encryption->fileAAD(), - parquet_encryption::Footer, - (int16_t)-1, (int16_t)-1, - (int16_t)-1)); - return impl_->WriteTo(dst, encryption); + const std::shared_ptr& encryptor) const { + return impl_->WriteTo(dst, encryptor); } class FileCryptoMetaData::FileCryptoMetaDataImpl { @@ -1003,7 +952,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } void WriteTo(::arrow::io::OutputStream* sink, - const std::shared_ptr& encryption) { + const std::shared_ptr& encryptor) { ThriftSerializer serializer; const auto& encrypt_md = properties_->column_encryption_props(column_->path()); @@ -1030,30 +979,28 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } column_chunk_->__set_crypto_metadata(ccmd); - auto footer_encryption = 
properties_->footer_encryption(); + // TODO: check file_encryption() is null or not + auto footer_key = properties_->file_encryption()->getFooterEncryptionKey(); // non-uniform: footer is unencrypted, or column is encrypted with a column-specific // key - if ((footer_encryption == nullptr && encrypt_md->isEncrypted()) || + if ((footer_key.empty() && encrypt_md->isEncrypted()) || !encrypt_md->isEncryptedWithFooterKey()) { // Thrift-serialize the ColumnMetaData structure, // encrypt it with the column key, and write to encrypted_column_metadata - auto encrypt_props = encryption; uint8_t* serialized_data; uint32_t serialized_len; - serializer.SerializeToBuffer(&column_metadata_, &serialized_len, &serialized_data); - // encrypt the footer key - std::vector encrypted_data(encrypt_props->CalculateCipherSize(serialized_len)); + std::vector encrypted_data(encryptor->CiphertextSizeDelta() + serialized_len); + unsigned encrypted_len = encryptor->Encrypt( + serialized_data, serialized_len, encrypted_data.data()); - unsigned encrypted_len = parquet_encryption::Encrypt( - encrypt_props, true, serialized_data, serialized_len, encrypted_data.data()); const char* temp = const_cast(reinterpret_cast(encrypted_data.data())); std::string encrypted_column_metadata(temp, encrypted_len); column_chunk_->__set_encrypted_column_metadata(encrypted_column_metadata); // Keep redacted metadata version for old readers - if (footer_encryption == nullptr) { + if (footer_key.empty()) { format::ColumnMetaData metadata_redacted = column_metadata_; if (metadata_redacted.__isset.statistics) { metadata_redacted.__isset.statistics = false; @@ -1077,11 +1024,6 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } } - std::shared_ptr - get_encryption_props(const std::shared_ptr& path) { - return properties_->encryption(path); - } - const ColumnDescriptor* descr() const { return column_; } int64_t total_compressed_size() const { return column_metadata_.total_compressed_size; } @@ -1135,11 
+1077,6 @@ void ColumnChunkMetaDataBuilder::set_file_path(const std::string& path) { impl_->set_file_path(path); } -std::shared_ptr -ColumnChunkMetaDataBuilder::get_encryption_props(const std::shared_ptr& path) { - return impl_->get_encryption_props(path); -} - void ColumnChunkMetaDataBuilder::Finish(int64_t num_values, int64_t dictionary_page_offset, int64_t index_page_offset, @@ -1151,8 +1088,8 @@ void ColumnChunkMetaDataBuilder::Finish(int64_t num_values, } void ColumnChunkMetaDataBuilder::WriteTo(::arrow::io::OutputStream* sink, - const std::shared_ptr& encryption) { - impl_->WriteTo(sink, encryption); + const std::shared_ptr& encryptor) { + impl_->WriteTo(sink, encryptor); } const ColumnDescriptor* ColumnChunkMetaDataBuilder::descr() const { @@ -1295,7 +1232,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { const std::shared_ptr& key_value_metadata) : properties_(props), schema_(schema), key_value_metadata_(key_value_metadata) { metadata_.reset(new format::FileMetaData()); - if (props->footer_encryption() != nullptr) { + if (props->file_encryption() != nullptr) { crypto_metadata_.reset(new format::FileCryptoMetaData()); } } @@ -1377,7 +1314,6 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { } auto file_encryption = properties_->file_encryption(); - auto footer_encryption = properties_->footer_encryption(); crypto_metadata_->__set_encryption_algorithm(ToThrift(file_encryption->getAlgorithm())); std::string key_metadata; diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index e80f90fd175c..17401da23b67 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -38,6 +38,10 @@ class EncodedStatistics; class Statistics; class SchemaDescriptor; class FileCryptoMetaData; +class InternalFileDecryptor; +class Decryptor; +class Encryptor; +class FooterSigningEncryptor; namespace schema { @@ -125,9 +129,7 @@ class PARQUET_EXPORT ColumnChunkMetaData { const ApplicationVersion* writer_version = NULLPTR, 
FileDecryptionProperties* file_decryption = NULLPTR, const EncryptionAlgorithm* algorithm = NULLPTR, - const std::string &fileAAD = "", - std::shared_ptr, - std::string, schema::ColumnPath::CmpColumnPath>> column_map = NULLPTR); + InternalFileDecryptor* file_decryptor = NULLPTR); ~ColumnChunkMetaData(); @@ -160,10 +162,7 @@ class PARQUET_EXPORT ColumnChunkMetaData { const ApplicationVersion* writer_version = NULLPTR, FileDecryptionProperties* file_decryption = NULLPTR, const EncryptionAlgorithm* algorithm = NULLPTR, - const std::string& fileAAD = "", - std::shared_ptr, - std::string, parquet::schema::ColumnPath::CmpColumnPath>> - column_map = NULLPTR); + InternalFileDecryptor* file_decryptor = NULLPTR); // PIMPL Idiom class ColumnChunkMetaDataImpl; std::unique_ptr impl_; @@ -187,11 +186,7 @@ class PARQUET_EXPORT RowGroupMetaData { std::unique_ptr ColumnChunk(int i, int16_t row_group_ordinal = -1, FileDecryptionProperties* file_decryption = NULLPTR, const EncryptionAlgorithm* algorithm = NULLPTR, - const std::string &fileAAD = "", - std::shared_ptr, - std::string, - parquet::schema::ColumnPath::CmpColumnPath>> - column_map = NULLPTR) const; + InternalFileDecryptor* file_decryptor = NULLPTR) const; private: explicit RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema, @@ -208,12 +203,12 @@ class PARQUET_EXPORT FileMetaData { // API convenience to get a MetaData accessor static std::shared_ptr Make(const void* serialized_metadata, uint32_t* metadata_len, - const std::shared_ptr& encryption = NULLPTR); + const std::shared_ptr& decryptor = NULLPTR); ~FileMetaData(); - bool verify(std::shared_ptr encryption, - const void* tail, uint32_t tail_len); + bool verify(std::shared_ptr encryptor, + const void* tail); // file metadata uint32_t size() const; @@ -232,7 +227,7 @@ class PARQUET_EXPORT FileMetaData { const ApplicationVersion& writer_version() const; - void WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryption = NULLPTR) const; + 
void WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryptor = NULLPTR) const; // Return const-pointer to make it clear that this object is not to be copied const SchemaDescriptor* schema() const; @@ -248,7 +243,7 @@ class PARQUET_EXPORT FileMetaData { private: friend FileMetaDataBuilder; explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len, - const std::shared_ptr& encryption = NULLPTR); + const std::shared_ptr& decryptor = NULLPTR); // PIMPL Idiom FileMetaData(); @@ -311,10 +306,7 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { // For writing metadata at end of column chunk void WriteTo(::arrow::io::OutputStream* sink, - const std::shared_ptr& encryption = NULLPTR); - std::shared_ptr - get_encryption_props(const std::shared_ptr& path); - + const std::shared_ptr& encryptor = NULLPTR); private: explicit ColumnChunkMetaDataBuilder(const std::shared_ptr& props, const ColumnDescriptor* column); diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 4189d2c90791..b7dc43be4f64 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -43,6 +43,8 @@ #include "arrow/util/logging.h" #include "parquet/exception.h" #include "parquet/platform.h" +#include "parquet/internal_file_encryptor.h" +#include "parquet/internal_file_decryptor.h" #include "parquet/statistics.h" #include "parquet/util/crypto.h" @@ -200,9 +202,9 @@ using ThriftBuffer = apache::thrift::transport::TMemoryBuffer; // set to the actual length of the header. template inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg, - const std::shared_ptr& encryption = NULLPTR, + const std::shared_ptr& decryptor = NULLPTR, bool shouldReadLength = false) { - if (encryption == NULLPTR) { + if (decryptor == NULLPTR) { // Deserialize msg bytes into c++ thrift msg using memory transport. 
shared_ptr tmem_transport( new ThriftBuffer(const_cast(buf), *len)); @@ -231,13 +233,13 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali } // decrypt const uint8_t* cipherBuf = shouldReadLength ? &buf[4] : buf; - std::vector decrypted_buffer(encryption->CalculatePlainSize(clen, true)); - uint32_t decrypted_buffer_len = parquet_encryption::Decrypt( - encryption, true, cipherBuf, 0, decrypted_buffer.data()); + std::vector decrypted_buffer(clen - decryptor->CiphertextSizeDelta()); + uint32_t decrypted_buffer_len = decryptor->Decrypt( + cipherBuf, 0, decrypted_buffer.data()); if (decrypted_buffer_len <= 0) { throw ParquetException("Couldn't decrypt buffer\n"); } - *len = encryption->CalculateCipherSize(decrypted_buffer_len, true); + *len = decrypted_buffer_len + decryptor->CiphertextSizeDelta(); DeserializeThriftMsg(decrypted_buffer.data(), &decrypted_buffer_len, deserialized_msg); @@ -273,20 +275,21 @@ class ThriftSerializer { template int64_t Serialize(const T* obj, ArrowOutputStream* out, - const std::shared_ptr& encryption = NULLPTR, + const std::shared_ptr& encryptor = NULLPTR, bool shouldWriteLength = false) { uint8_t* out_buffer; uint32_t out_length; SerializeToBuffer(obj, &out_length, &out_buffer); - if (encryption == NULLPTR) { + if (encryptor == NULLPTR) { PARQUET_THROW_NOT_OK(out->Write(out_buffer, out_length)); return static_cast(out_length); } else { - std::vector cipher_buffer(encryption->CalculateCipherSize(out_length)); - unsigned cipher_buffer_len = parquet_encryption::Encrypt( - encryption, true, out_buffer, out_length, cipher_buffer.data()); - if (cipher_buffer_len > cipher_buffer.size()) { + std::vector cipher_buffer(encryptor->CiphertextSizeDelta() + out_length); + int cipher_buffer_len = encryptor->Encrypt(out_buffer, out_length, + cipher_buffer.data()); + + if (cipher_buffer_len > static_cast(cipher_buffer.size())) { std::stringstream ss; ss << "cipher length is greater than cipher buffer capacity: " << 
cipher_buffer_len << cipher_buffer.size() << "\n"; throw ParquetException(ss.str()); From fbeeff24a8e247870c1b6ec9cad2af0696cf51f3 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Tue, 7 May 2019 08:53:35 +0700 Subject: [PATCH 032/201] fix issue when column is encrypted in footer plaintext mode --- cpp/src/parquet/file_reader.cc | 18 ++++++++---------- cpp/src/parquet/metadata.cc | 3 ++- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 38af51c218fc..f9fd1a4f58c8 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -160,6 +160,10 @@ class SerializedRowGroup : public RowGroupReader::Contents { parquet_encryption::ColumnMetaData, row_group_ordinal_, (int16_t)i, (int16_t)-1); + + ParquetCipher::type footer_algorithm = file_metadata_->is_plaintext_mode() + ? file_metadata_->encryption_algorithm().algorithm + : file_crypto_metadata_->encryption_algorithm().algorithm; // the column is encrypted with footer key if (crypto_metadata->encrypted_with_footer_key()) { @@ -167,14 +171,10 @@ class SerializedRowGroup : public RowGroupReader::Contents { ? file_metadata_->footer_signing_key_metadata() : file_crypto_metadata_->key_metadata(); - ParquetCipher::type algorithm = file_metadata_->is_plaintext_mode() - ? 
file_metadata_->encryption_algorithm().algorithm - : file_crypto_metadata_->encryption_algorithm().algorithm; - auto meta_decryptor = file_decryptor_->GetFooterDecryptorForColumnMeta( - algorithm, footer_key_metadata, aad); + footer_algorithm, footer_key_metadata, aad); auto data_decryptor = file_decryptor_->GetFooterDecryptorForColumnData( - algorithm, footer_key_metadata, aad); + footer_algorithm, footer_key_metadata, aad); return PageReader::Open(stream, col->num_values(), col->compression(), col->has_dictionary_page(), row_group_ordinal_, @@ -190,12 +190,10 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::make_shared(crypto_metadata->path_in_schema()); auto meta_decryptor = file_decryptor_->GetColumnMetaDecryptor( - column_path, - file_crypto_metadata_->encryption_algorithm().algorithm, + column_path, footer_algorithm, column_key_metadata, aad); auto data_decryptor = file_decryptor_->GetColumnDataDecryptor( - column_path, - file_crypto_metadata_->encryption_algorithm().algorithm, + column_path, footer_algorithm, column_key_metadata, aad); return PageReader::Open(stream, col->num_values(), diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 4944db078085..a7a2315fe63c 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -1232,7 +1232,8 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { const std::shared_ptr& key_value_metadata) : properties_(props), schema_(schema), key_value_metadata_(key_value_metadata) { metadata_.reset(new format::FileMetaData()); - if (props->file_encryption() != nullptr) { + if (props->file_encryption() != nullptr + && props->file_encryption()->getFooterSigningKey() == NULL_STRING) { crypto_metadata_.reset(new format::FileCryptoMetaData()); } } From 2eb4f3f7ef3fec18dcecd25ec52330e7acff9295 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Tue, 7 May 2019 08:59:33 +0700 Subject: [PATCH 033/201] remove EncryptionProperties --- cpp/src/parquet/properties.h | 39 
-------------------------- cpp/src/parquet/types.h | 54 ------------------------------------ 2 files changed, 93 deletions(-) diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index a9ebdb5f00cc..962dd5e56850 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -391,19 +391,6 @@ class PARQUET_EXPORT WriterProperties { return parquet_file_encryption_.get(); } - inline std::shared_ptr footer_encryption() const { - if (parquet_file_encryption_ == NULLPTR) { - return NULLPTR; - } else { - std::string footer_key = parquet_file_encryption_->getFooterEncryptionKey (); - if (footer_key.empty()) - footer_key = parquet_file_encryption_->getFooterSigningKey (); - return std::make_shared(parquet_file_encryption_->getAlgorithm().algorithm, - footer_key, parquet_file_encryption_->getFileAAD()); - - } - } - inline Encoding::type dictionary_index_encoding() const { if (parquet_version_ == ParquetVersion::PARQUET_1_0) { return Encoding::PLAIN_DICTIONARY; @@ -456,32 +443,6 @@ class PARQUET_EXPORT WriterProperties { } } - std::shared_ptr encryption( - const std::shared_ptr& path) const { - if (parquet_file_encryption_) { - auto column_prop = parquet_file_encryption_->getColumnProperties(path); - if (column_prop == NULLPTR) - return NULLPTR; - if (column_prop->isEncryptedWithFooterKey()) { - if (parquet_file_encryption_->encryptedFooter ()) { - return std::make_shared(parquet_file_encryption_->getAlgorithm().algorithm, - parquet_file_encryption_->getFooterEncryptionKey(), - parquet_file_encryption_->getFileAAD()); - } else { - return std::make_shared(parquet_file_encryption_->getAlgorithm().algorithm, - parquet_file_encryption_->getFooterSigningKey(), - parquet_file_encryption_->getFileAAD()); - } - } - - return std::make_shared(parquet_file_encryption_->getAlgorithm().algorithm, - column_prop->getKey(), - parquet_file_encryption_->getFileAAD()); - } else { - return NULLPTR; - } - } - private: explicit WriterProperties( MemoryPool* 
pool, int64_t dictionary_pagesize_limit, int64_t write_batch_size, diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 2cd41833b4b0..d1b53986e657 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -479,60 +479,6 @@ struct EncryptionAlgorithm { AadMetadata aad; }; -class PARQUET_EXPORT EncryptionProperties { - public: - EncryptionProperties() = default; - EncryptionProperties(ParquetCipher::type algorithm, const std::string& key, - const std::string& file_aad, const std::string& aad = "") - : algorithm_(algorithm), key_(key), file_aad_(file_aad), aad_(aad) {} - - ~EncryptionProperties() { key_.replace(0, key_.length(), key_.length(), '\0'); } - - int key_length() const { return static_cast(key_.length()); } - uint8_t* key_bytes() const { return str2bytes(key_); } - - void aad(const std::string& aad) { aad_ = aad; } - int aad_length() const { return static_cast(aad_.length()); } - uint8_t* aad_bytes() const { return str2bytes(aad_); } - - ParquetCipher::type algorithm() const { return algorithm_; } - - const std::string& key() const { return key_; } - const std::string& aad() const { return aad_; } - const std::string& fileAAD() const { return file_aad_; } - - uint32_t CalculateCipherSize(uint32_t plain_len, bool is_metadata = false) const { - if (is_metadata || algorithm_ == ParquetCipher::AES_GCM_V1) { - return plain_len + 28 + 4; - } else if (algorithm_ == ParquetCipher::AES_GCM_CTR_V1) { - return plain_len + 16 + 4; - } - return plain_len; - } - - uint32_t CalculatePlainSize(uint32_t cipher_len, bool is_metadata = false) const { - if (is_metadata || algorithm_ == ParquetCipher::AES_GCM_V1) { - return cipher_len - 28 - 4; - } else if (algorithm_ == ParquetCipher::AES_GCM_CTR_V1) { - return cipher_len - 16 - 4; - } - return cipher_len; - } - - private: - ParquetCipher::type algorithm_; // encryption algorithm - std::string key_; // encryption key, should have 16, 24, 32-byte length - std::string file_aad_; - std::string aad_; // 
encryption additional authenticated data - static inline uint8_t* str2bytes(const std::string& str) { - if (str.empty()) return NULLPTR; - - char* cbytes = const_cast(str.c_str()); - return reinterpret_cast(cbytes); - } - -}; - // parquet::PageType struct PageType { enum type { DATA_PAGE, INDEX_PAGE, DICTIONARY_PAGE, DATA_PAGE_V2 }; From 3cf56be38582006b7f0d3062ddaee06094d17262 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 7 May 2019 11:34:58 +0300 Subject: [PATCH 034/201] Change HiddenColumnExceptio message --- cpp/src/parquet/internal_file_decryptor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index 91a1730a2383..68e2526929cf 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -179,7 +179,7 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( } } if (column_key.empty()) { - throw HiddenColumnException("column is encrypted with null key, path=" + + throw HiddenColumnException("HiddenColumnException, path=" + column_path->ToDotString()); } From 97aad7bafcc83c67c582c2b910c1b74c946c3c62 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 7 May 2019 11:37:53 +0300 Subject: [PATCH 035/201] Fix indentation in encryption_properties.cc --- cpp/src/parquet/encryption_properties.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/encryption_properties.cc b/cpp/src/parquet/encryption_properties.cc index 1eb3847ec718..ba893ce1816b 100644 --- a/cpp/src/parquet/encryption_properties.cc +++ b/cpp/src/parquet/encryption_properties.cc @@ -169,7 +169,7 @@ FileEncryptionProperties::FileEncryptionProperties(ParquetCipher::type cipher, algorithm_.aad.aad_file_unique = aad_file_unique_str; algorithm_.aad.supply_aad_prefix = supply_aad_prefix; if (!aad_prefix.empty() && store_aad_prefix_in_file) { - algorithm_.aad.aad_prefix = aad_prefix; + algorithm_.aad.aad_prefix 
= aad_prefix; } } From 9dabd99f5cf9af21fd5ae3d93776e4f0cca23bf9 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 7 May 2019 11:44:36 +0300 Subject: [PATCH 036/201] Rename functions in DecryptionKeyRetriever --- cpp/src/parquet/encryption.cc | 8 ++++---- cpp/src/parquet/encryption.h | 10 +++++----- cpp/src/parquet/internal_file_decryptor.cc | 6 +++--- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/cpp/src/parquet/encryption.cc b/cpp/src/parquet/encryption.cc index 0a2d9ef939a6..542972538564 100644 --- a/cpp/src/parquet/encryption.cc +++ b/cpp/src/parquet/encryption.cc @@ -22,11 +22,11 @@ namespace parquet { // integer key retriever -void IntegerKeyIdRetriever::PutKey(uint32_t key_id, const std::string& key) { +void IntegerKeyIdRetriever::putKey(uint32_t key_id, const std::string& key) { key_map_.insert(std::make_pair(key_id, key)); } -const std::string& IntegerKeyIdRetriever::GetKey(const std::string& key_metadata) { +const std::string& IntegerKeyIdRetriever::getKey(const std::string& key_metadata) { uint32_t key_id; memcpy(reinterpret_cast(&key_id), key_metadata.c_str(), 4); @@ -34,11 +34,11 @@ const std::string& IntegerKeyIdRetriever::GetKey(const std::string& key_metadata } // string key retriever -void StringKeyIdRetriever::PutKey(const std::string& key_id, const std::string& key) { +void StringKeyIdRetriever::putKey(const std::string& key_id, const std::string& key) { key_map_.insert(std::make_pair(key_id, key)); } -const std::string& StringKeyIdRetriever::GetKey(const std::string& key_id) { +const std::string& StringKeyIdRetriever::getKey(const std::string& key_id) { return key_map_[key_id]; } diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 3a4481bd4aad..60b7b8c33004 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -28,15 +28,15 @@ namespace parquet { class PARQUET_EXPORT DecryptionKeyRetriever { public: - virtual const std::string& GetKey(const std::string& key_metadata) = 
0; + virtual const std::string& getKey(const std::string& key_metadata) = 0; virtual ~DecryptionKeyRetriever() {} }; // Simple integer key retriever class PARQUET_EXPORT IntegerKeyIdRetriever : public DecryptionKeyRetriever { public: - void PutKey(uint32_t key_id, const std::string& key); - const std::string& GetKey(const std::string& key_metadata); + void putKey(uint32_t key_id, const std::string& key); + const std::string& getKey(const std::string& key_metadata); private: std::map key_map_; @@ -45,8 +45,8 @@ class PARQUET_EXPORT IntegerKeyIdRetriever : public DecryptionKeyRetriever { // Simple string key retriever class PARQUET_EXPORT StringKeyIdRetriever : public DecryptionKeyRetriever { public: - void PutKey(const std::string& key_id, const std::string& key); - const std::string& GetKey(const std::string& key_metadata); + void putKey(const std::string& key_id, const std::string& key); + const std::string& getKey(const std::string& key_metadata); private: std::map key_map_; diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index 68e2526929cf..d00b5a035eb0 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -68,7 +68,7 @@ std::shared_ptr InternalFileDecryptor::GetFooterSigningE throw ParquetException("No footer key or key retriever"); try { footer_key = - properties_->getKeyRetriever()->GetKey(footer_key_metadata); + properties_->getKeyRetriever()->getKey(footer_key_metadata); } catch (KeyAccessDeniedException &e) { std::stringstream ss; ss << "Footer key: access denied " << e.what() << "\n"; @@ -118,7 +118,7 @@ std::shared_ptr InternalFileDecryptor::GetFooterDecryptor( if (properties_->getKeyRetriever() == nullptr) throw ParquetException("No footer key or key retriever"); try { - footer_key = properties_->getKeyRetriever()->GetKey(footer_key_metadata); + footer_key = properties_->getKeyRetriever()->getKey(footer_key_metadata); } catch 
(KeyAccessDeniedException &e) { std::stringstream ss; ss << "Footer key: access denied " << e.what() << "\n";; @@ -168,7 +168,7 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( if (column_key.empty() && !column_key_metadata.empty() && properties_->getKeyRetriever() != nullptr){ try { - column_key = properties_->getKeyRetriever()->GetKey(column_key_metadata); + column_key = properties_->getKeyRetriever()->getKey(column_key_metadata); } catch (KeyAccessDeniedException &e) { std::stringstream ss; ss << "HiddenColumnException, path=" + From e7871d60dadd5e456d415192c473c8747d1c2720 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 7 May 2019 16:45:18 +0300 Subject: [PATCH 037/201] Add check for aad_prefix to withoutAADPrefixStorage --- cpp/src/parquet/encryption_properties.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/parquet/encryption_properties.h b/cpp/src/parquet/encryption_properties.h index f3ed589260f1..d2e4246e8a62 100644 --- a/cpp/src/parquet/encryption_properties.h +++ b/cpp/src/parquet/encryption_properties.h @@ -398,6 +398,8 @@ class PARQUET_EXPORT FileEncryptionProperties { // Skip storing AAD Prefix in file. // If not called, and if AAD Prefix is set, it will be stored. 
Builder* withoutAADPrefixStorage() { + DCHECK(!aad_prefix_.empty()); + store_aad_prefix_in_file_ = false; return this; } From 99ca2a6da6bfb6511889cd6d10ecd942ad4ad841 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 7 May 2019 16:47:14 +0300 Subject: [PATCH 038/201] Add exception to FromThrift in thrift.h --- cpp/src/parquet/thrift.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index b7dc43be4f64..4a5a98fff8c9 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -106,10 +106,11 @@ static inline EncryptionAlgorithm FromThrift(format::EncryptionAlgorithm encrypt if (encryption.__isset.AES_GCM_V1) { encryption_algorithm.algorithm = ParquetCipher::AES_GCM_V1; encryption_algorithm.aad = FromThrift(encryption.AES_GCM_V1); - - } else { + } else if (encryption.__isset.AES_GCM_CTR_V1) { encryption_algorithm.algorithm = ParquetCipher::AES_GCM_CTR_V1; encryption_algorithm.aad = FromThrift(encryption.AES_GCM_CTR_V1); + } else { + throw ParquetException("Unsupported algorithm"); } return encryption_algorithm; } From baa162ca5b708aa593656584fd25dfdbb66eb6ab Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 7 May 2019 16:55:00 +0300 Subject: [PATCH 039/201] Fix prefix aad calculation --- cpp/src/parquet/file_reader.cc | 65 +++++++++++++++------------------- 1 file changed, 28 insertions(+), 37 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index f9fd1a4f58c8..adc43b085b63 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -155,7 +155,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { } // the column is encrypted - std::string aad = parquet_encryption::createModuleAAD( + std::string aad_column_meta_data = parquet_encryption::createModuleAAD( properties_.fileAAD(), parquet_encryption::ColumnMetaData, row_group_ordinal_, @@ -172,9 +172,9 @@ class SerializedRowGroup : public 
RowGroupReader::Contents { : file_crypto_metadata_->key_metadata(); auto meta_decryptor = file_decryptor_->GetFooterDecryptorForColumnMeta( - footer_algorithm, footer_key_metadata, aad); + footer_algorithm, footer_key_metadata, aad_column_meta_data); auto data_decryptor = file_decryptor_->GetFooterDecryptorForColumnData( - footer_algorithm, footer_key_metadata, aad); + footer_algorithm, footer_key_metadata, aad_column_meta_data); return PageReader::Open(stream, col->num_values(), col->compression(), col->has_dictionary_page(), row_group_ordinal_, @@ -191,10 +191,10 @@ class SerializedRowGroup : public RowGroupReader::Contents { auto meta_decryptor = file_decryptor_->GetColumnMetaDecryptor( column_path, footer_algorithm, - column_key_metadata, aad); + column_key_metadata, aad_column_meta_data); auto data_decryptor = file_decryptor_->GetColumnDataDecryptor( column_path, footer_algorithm, - column_key_metadata, aad); + column_key_metadata, aad_column_meta_data); return PageReader::Open(stream, col->num_values(), col->compression(), @@ -302,33 +302,28 @@ class SerializedFile : public ParquetFileReader::Contents { throw ParquetException("No decryption properties are provided"); } + std::string aad_prefix = file_decryption->getAADPrefix(); + EncryptionAlgorithm algo = file_metadata_->encryption_algorithm(); - bool supply_aad_prefix = algo.aad.supply_aad_prefix; - std::string aad_file_unique = algo.aad.aad_file_unique; - std::string aad_prefix = algo.aad.aad_prefix; - if (algo.algorithm != ParquetCipher::AES_GCM_CTR_V1 - && algo.algorithm != ParquetCipher::AES_GCM_V1) - throw ParquetException("Unsupported algorithm"); - if (!file_decryption->getAADPrefix().empty()) { - if (file_decryption->getAADPrefix().compare(aad_prefix) != 0) { - throw ParquetException("ADD Prefix in file and " - "in properties is not the same"); + if (!algo.aad.aad_prefix.empty()) { + if (!aad_prefix.empty()) { + if (aad_prefix.compare(algo.aad.aad_prefix) != 0) { + throw ParquetException("ADD Prefix 
in file and " + "in properties is not the same"); + } } + aad_prefix = algo.aad.aad_prefix; std::shared_ptr aad_prefix_verifier = file_decryption->getAADPrefixVerifier(); if (aad_prefix_verifier != NULLPTR) aad_prefix_verifier->check(aad_prefix); } - if (supply_aad_prefix && file_decryption->getAADPrefix().empty()) { + if (algo.aad.supply_aad_prefix && aad_prefix.empty()) { throw ParquetException("AAD prefix used for file encryption, " "but not stored in file and not supplied " "in decryption properties"); } - std::string fileAAD; - if (!supply_aad_prefix) - fileAAD = aad_prefix + aad_file_unique; - else - fileAAD = file_decryption->getAADPrefix() + aad_file_unique; + std::string fileAAD = aad_prefix + algo.aad.aad_file_unique; file_decryptor_->file_aad(fileAAD); if (file_decryption->checkFooterIntegrity()) { @@ -385,32 +380,28 @@ class SerializedFile : public ParquetFileReader::Contents { file_crypto_metadata_ = FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); EncryptionAlgorithm algo = file_crypto_metadata_->encryption_algorithm(); - bool supply_aad_prefix = algo.aad.supply_aad_prefix; - std::string aad_file_unique = algo.aad.aad_file_unique; - std::string aad_prefix = algo.aad.aad_prefix; - if (algo.algorithm != ParquetCipher::AES_GCM_CTR_V1 - && algo.algorithm != ParquetCipher::AES_GCM_V1) - throw ParquetException("Unsupported algorithm"); - if (!file_decryption->getAADPrefix().empty()) { - if (file_decryption->getAADPrefix().compare(aad_prefix) != 0) { - throw ParquetException("ADD Prefix in file and in properties " - "is not the same"); + + std::string aad_prefix = file_decryption->getAADPrefix(); + + if (!algo.aad.aad_prefix.empty()) { + if (!aad_prefix.empty()) { + if (aad_prefix.compare(algo.aad.aad_prefix) != 0) { + throw ParquetException("ADD Prefix in file and in properties " + "is not the same"); + } } + aad_prefix = algo.aad.aad_prefix; std::shared_ptr aad_prefix_verifier = file_decryption->getAADPrefixVerifier(); if 
(aad_prefix_verifier != NULLPTR) aad_prefix_verifier->check(aad_prefix); } - if (supply_aad_prefix && file_decryption->getAADPrefix().empty()) { + if (algo.aad.supply_aad_prefix && aad_prefix.empty()) { throw ParquetException("AAD prefix used for file encryption, " "but not stored in file and not supplied " "in decryption properties"); } - std::string fileAAD; - if (!supply_aad_prefix) - fileAAD = aad_prefix + aad_file_unique; - else - fileAAD = file_decryption->getAADPrefix() + aad_file_unique; + std::string fileAAD = aad_prefix + algo.aad.aad_file_unique; // save fileAAD for later use file_decryptor_->file_aad(fileAAD); From f92df576f00b53fa7c715046605ec0de7600ef90 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 7 May 2019 17:31:55 +0300 Subject: [PATCH 040/201] Remove fileAAD from ReaderProperties --- cpp/src/parquet/column_reader.cc | 2 -- cpp/src/parquet/file_reader.cc | 2 +- cpp/src/parquet/properties.h | 5 ----- 3 files changed, 1 insertion(+), 8 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index fc4c0d8fbafd..f2c358133dfa 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -108,8 +108,6 @@ ReaderProperties default_reader_properties() { if (default_reader_properties.column_map() != NULLPTR && default_reader_properties.column_map()->size () != 0) default_reader_properties.column_map()->clear(); - if (!default_reader_properties.fileAAD().empty()) - default_reader_properties.set_fileAAD (""); return default_reader_properties; } diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index adc43b085b63..e0493492c472 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -156,7 +156,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { // the column is encrypted std::string aad_column_meta_data = parquet_encryption::createModuleAAD( - properties_.fileAAD(), + file_decryptor_->file_aad(), 
parquet_encryption::ColumnMetaData, row_group_ordinal_, (int16_t)i, (int16_t)-1); diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 962dd5e56850..fd311bc0d139 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -75,10 +75,6 @@ class PARQUET_EXPORT ReaderProperties { return column_map_; } - const std::string& fileAAD() { return fileAAD_; } - - void set_fileAAD (std::string fileAAD) { fileAAD_ = fileAAD; } - void file_decryption(const std::shared_ptr& decryption) { file_decryption_ = decryption; } @@ -93,7 +89,6 @@ class PARQUET_EXPORT ReaderProperties { std::shared_ptr, std::string, parquet::schema::ColumnPath::CmpColumnPath>> column_map_; // a map between //ColumnPath and their encryption keys - std::string fileAAD_; }; ReaderProperties PARQUET_EXPORT default_reader_properties(); From f2c000eae799cae1f11e02f2031898442c2df3c6 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 7 May 2019 17:39:31 +0300 Subject: [PATCH 041/201] Remove column_map from ReaderProperties --- cpp/src/parquet/column_reader.cc | 5 ----- cpp/src/parquet/internal_file_decryptor.h | 4 +++- cpp/src/parquet/properties.h | 12 ------------ 3 files changed, 3 insertions(+), 18 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index f2c358133dfa..3a0a4188d91c 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -103,11 +103,6 @@ int LevelDecoder::Decode(int batch_size, int16_t* levels) { ReaderProperties default_reader_properties() { static ReaderProperties default_reader_properties; - // reset column_map and fileAAD as default_reader_properties is static but - // can be used when reading parquet file with different reading options. 
- if (default_reader_properties.column_map() != NULLPTR - && default_reader_properties.column_map()->size () != 0) - default_reader_properties.column_map()->clear(); return default_reader_properties; } diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h index c92fc08c8738..09508ce88700 100644 --- a/cpp/src/parquet/internal_file_decryptor.h +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -84,7 +84,9 @@ class InternalFileDecryptor { private: FileDecryptionProperties* properties_; + // Concatenation of aad_prefix (if exists) and aad_file_unique std::string file_aad_; + // A map between ColumnPath and their encryption keys: std::shared_ptr, std::string, parquet::schema::ColumnPath::CmpColumnPath>> column_map_; @@ -113,4 +115,4 @@ class InternalFileDecryptor { } -#endif // INTERNAL_FILE_ENCRYPTORS_H \ No newline at end of file +#endif // INTERNAL_FILE_ENCRYPTORS_H diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index fd311bc0d139..b3ced91e717d 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -49,10 +49,6 @@ class PARQUET_EXPORT ReaderProperties { : pool_(pool) { buffered_stream_enabled_ = DEFAULT_USE_BUFFERED_STREAM; buffer_size_ = DEFAULT_BUFFER_SIZE; - column_map_ = std::shared_ptr, - std::string, parquet::schema::ColumnPath::CmpColumnPath>>(new std::map, - std::string, - schema::ColumnPath::CmpColumnPath>()); } MemoryPool* memory_pool() const { return pool_; } @@ -70,11 +66,6 @@ class PARQUET_EXPORT ReaderProperties { int64_t buffer_size() const { return buffer_size_; } - std::shared_ptr, - std::string, parquet::schema::ColumnPath::CmpColumnPath>> column_map () { - return column_map_; - } - void file_decryption(const std::shared_ptr& decryption) { file_decryption_ = decryption; } @@ -86,9 +77,6 @@ class PARQUET_EXPORT ReaderProperties { int64_t buffer_size_; bool buffered_stream_enabled_; std::shared_ptr file_decryption_; - std::shared_ptr, - std::string, 
parquet::schema::ColumnPath::CmpColumnPath>> column_map_; // a map between - //ColumnPath and their encryption keys }; ReaderProperties PARQUET_EXPORT default_reader_properties(); From 8b9574fa5ba766f9ea719e9401c6004d56d98ac0 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 7 May 2019 21:32:49 +0300 Subject: [PATCH 042/201] Fix check for encryption and the existence of file_decryption in file_reader.cc --- cpp/src/parquet/file_reader.cc | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index e0493492c472..ede78e2e243d 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -142,7 +142,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { // file is unencrypted // or file is encrypted but column is unencrypted - if ((!file_crypto_metadata_ && !file_metadata_->is_plaintext_mode()) || !crypto_metadata) { + if (!crypto_metadata) { encrypted = false; } @@ -164,7 +164,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { ParquetCipher::type footer_algorithm = file_metadata_->is_plaintext_mode() ?
file_metadata_->encryption_algorithm().algorithm : file_crypto_metadata_->encryption_algorithm().algorithm; - + // the column is encrypted with footer key if (crypto_metadata->encrypted_with_footer_key()) { const std::string& footer_key_metadata = file_metadata_->is_plaintext_mode() @@ -182,13 +182,12 @@ class SerializedRowGroup : public RowGroupReader::Contents { meta_decryptor, data_decryptor); } - // file is non-uniform encrypted and the column - // is encrypted with its own key + // file is encrypted and the column is encrypted with its own key std::string column_key_metadata = crypto_metadata->key_metadata(); std::shared_ptr column_path = std::make_shared(crypto_metadata->path_in_schema()); - + auto meta_decryptor = file_decryptor_->GetColumnMetaDecryptor( column_path, footer_algorithm, column_key_metadata, aad_column_meta_data); @@ -297,10 +296,10 @@ class SerializedFile : public ParquetFileReader::Contents { if (file_metadata_->is_plaintext_mode()) { auto file_decryption = properties_.file_decryption(); - file_decryptor_.reset(new InternalFileDecryptor(file_decryption)); - if (file_decryption == nullptr) { + if (file_decryption == NULLPTR) { throw ParquetException("No decryption properties are provided"); } + file_decryptor_.reset(new InternalFileDecryptor(file_decryption)); std::string aad_prefix = file_decryption->getAADPrefix(); From dc25ba1c0ba087e209ea31de9b2d6f4618a37ef4 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Wed, 8 May 2019 04:19:26 +0300 Subject: [PATCH 043/201] Save footer_key_metadata, algorithm, footer_decryptor and footer_signing_encryptor in InternalFileDecryptor and remove redundant AAD calculation in file_reader.cc --- cpp/src/parquet/file_reader.cc | 54 ++++------------ cpp/src/parquet/internal_file_decryptor.cc | 74 ++++++++++------------ cpp/src/parquet/internal_file_decryptor.h | 47 +++++++------- cpp/src/parquet/metadata.cc | 28 ++++---- cpp/src/parquet/metadata.h | 3 - 5 files changed, 81 insertions(+), 125 deletions(-) diff 
--git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index ede78e2e243d..707ed0b6e0c7 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -102,17 +102,10 @@ class SerializedRowGroup : public RowGroupReader::Contents { const ReaderProperties* properties() const override { return &properties_; } std::unique_ptr GetColumnPageReader(int i) override { - EncryptionAlgorithm algorithm; - if (file_crypto_metadata_) { - algorithm = file_crypto_metadata_->encryption_algorithm(); - } - else if (file_metadata_->is_plaintext_mode()) { - algorithm = file_metadata_->encryption_algorithm(); - } // Read column chunk from the file auto col = row_group_metadata_->ColumnChunk(i, row_group_ordinal_, properties_.file_decryption(), - &algorithm, file_decryptor_); + file_decryptor_); int64_t col_start = col->data_page_offset(); if (col->has_dictionary_page() && col->dictionary_page_offset() > 0 && col_start > col->dictionary_page_offset()) { @@ -155,26 +148,11 @@ class SerializedRowGroup : public RowGroupReader::Contents { } // the column is encrypted - std::string aad_column_meta_data = parquet_encryption::createModuleAAD( - file_decryptor_->file_aad(), - parquet_encryption::ColumnMetaData, - row_group_ordinal_, - (int16_t)i, (int16_t)-1); - - ParquetCipher::type footer_algorithm = file_metadata_->is_plaintext_mode() - ? file_metadata_->encryption_algorithm().algorithm - : file_crypto_metadata_->encryption_algorithm().algorithm; // the column is encrypted with footer key if (crypto_metadata->encrypted_with_footer_key()) { - const std::string& footer_key_metadata = file_metadata_->is_plaintext_mode() - ? 
file_metadata_->footer_signing_key_metadata() - : file_crypto_metadata_->key_metadata(); - - auto meta_decryptor = file_decryptor_->GetFooterDecryptorForColumnMeta( - footer_algorithm, footer_key_metadata, aad_column_meta_data); - auto data_decryptor = file_decryptor_->GetFooterDecryptorForColumnData( - footer_algorithm, footer_key_metadata, aad_column_meta_data); + auto meta_decryptor = file_decryptor_->GetFooterDecryptorForColumnMeta(); + auto data_decryptor = file_decryptor_->GetFooterDecryptorForColumnData(); return PageReader::Open(stream, col->num_values(), col->compression(), col->has_dictionary_page(), row_group_ordinal_, @@ -189,11 +167,11 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::make_shared(crypto_metadata->path_in_schema()); auto meta_decryptor = file_decryptor_->GetColumnMetaDecryptor( - column_path, footer_algorithm, - column_key_metadata, aad_column_meta_data); + column_path, + column_key_metadata); auto data_decryptor = file_decryptor_->GetColumnDataDecryptor( - column_path, footer_algorithm, - column_key_metadata, aad_column_meta_data); + column_path, + column_key_metadata); return PageReader::Open(stream, col->num_values(), col->compression(), @@ -325,16 +303,15 @@ class SerializedFile : public ParquetFileReader::Contents { std::string fileAAD = aad_prefix + algo.aad.aad_file_unique; file_decryptor_->file_aad(fileAAD); + file_decryptor_->algorithm(algo.algorithm); + file_decryptor_->footer_key_metadata(file_metadata_->footer_signing_key_metadata()); if (file_decryption->checkFooterIntegrity()) { if (metadata_len - read_metadata_len != 28) { throw ParquetException("Invalid parquet file. Cannot verify plaintext" "mode footer."); } - std::string footer_key_metadata = file_metadata_->footer_signing_key_metadata(); - auto encryptor = file_decryptor_->GetFooterSigningEncryptor( - file_metadata_->encryption_algorithm().algorithm, - footer_key_metadata); + auto encryptor = file_decryptor_->GetFooterSigningEncryptor(); if (! 
file_metadata_->verify(encryptor, metadata_buffer->data() + read_metadata_len)) { throw ParquetException("Invalid parquet file. Could not verify plaintext" @@ -403,7 +380,9 @@ class SerializedFile : public ParquetFileReader::Contents { std::string fileAAD = aad_prefix + algo.aad.aad_file_unique; // save fileAAD for later use file_decryptor_->file_aad(fileAAD); - + file_decryptor_->algorithm(algo.algorithm); + file_decryptor_->footer_key_metadata(file_crypto_metadata_->key_metadata()); + int64_t metadata_offset = file_size - kFooterSize - footer_len + crypto_metadata_len; uint32_t metadata_len = footer_len - crypto_metadata_len; std::shared_ptr metadata_buffer; @@ -414,12 +393,7 @@ class SerializedFile : public ParquetFileReader::Contents { "Could not read footer metadata bytes."); } - // get footer key metadata - std::string footer_key_metadata = file_crypto_metadata_->key_metadata(); - - auto footer_decryptor = file_decryptor_->GetFooterDecryptor( - file_crypto_metadata_->encryption_algorithm().algorithm, - footer_key_metadata); + auto footer_decryptor = file_decryptor_->GetFooterDecryptor(); file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &metadata_len, footer_decryptor); diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index d00b5a035eb0..9f71ac03de67 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -55,20 +55,20 @@ int Decryptor::Decrypt(const uint8_t* ciphertext, int ciphertext_len, uint8_t* p InternalFileDecryptor::InternalFileDecryptor(FileDecryptionProperties* properties) : properties_(properties) {} -std::shared_ptr InternalFileDecryptor::GetFooterSigningEncryptor( - ParquetCipher::type algorithm, - const std::string& footer_key_metadata) +std::shared_ptr InternalFileDecryptor::GetFooterSigningEncryptor() { + if (footer_signing_encryptor_ != NULLPTR) + return footer_signing_encryptor_; std::string footer_key = 
properties_->getFooterKey(); // ignore footer key metadata if footer key is explicitly set via API if (footer_key.empty()) { - if (footer_key_metadata.empty()) + if (footer_key_metadata_.empty()) throw ParquetException("No footer key or key metadata"); if (properties_->getKeyRetriever() == nullptr) throw ParquetException("No footer key or key retriever"); try { footer_key = - properties_->getKeyRetriever()->getKey(footer_key_metadata); + properties_->getKeyRetriever()->getKey(footer_key_metadata_); } catch (KeyAccessDeniedException &e) { std::stringstream ss; ss << "Footer key: access denied " << e.what() << "\n"; @@ -82,43 +82,39 @@ std::shared_ptr InternalFileDecryptor::GetFooterSigningE std::string aad = parquet_encryption::createFooterAAD(file_aad_); - return std::make_shared( - algorithm, footer_key, file_aad_, aad); + footer_signing_encryptor_ = std::make_shared( + algorithm_, footer_key, file_aad_, aad); + return footer_signing_encryptor_; } -std::shared_ptr InternalFileDecryptor::GetFooterDecryptor( - ParquetCipher::type algorithm, - const std::string& footer_key_metadata) { +std::shared_ptr InternalFileDecryptor::GetFooterDecryptor() { std::string aad = parquet_encryption::createFooterAAD(file_aad_); - return GetFooterDecryptor(algorithm, footer_key_metadata, aad, true); + return GetFooterDecryptor(aad, true); } std::shared_ptr InternalFileDecryptor::GetFooterDecryptorForColumnMeta( - ParquetCipher::type algorithm, - const std::string& footer_key_metadata, - const std::string& aad) { - return GetFooterDecryptor(algorithm, footer_key_metadata, aad, true); + const std::string& aad) +{ + return GetFooterDecryptor(aad, true); } std::shared_ptr InternalFileDecryptor::GetFooterDecryptorForColumnData( - ParquetCipher::type algorithm, - const std::string& footer_key_metadata, const std::string& aad) { - return GetFooterDecryptor(algorithm, footer_key_metadata, aad, false); + return GetFooterDecryptor(aad, false); } std::shared_ptr 
InternalFileDecryptor::GetFooterDecryptor( - ParquetCipher::type algorithm, - const std::string& footer_key_metadata, const std::string& aad, bool metadata) { + if (footer_decryptor_ != NULLPTR) + return footer_decryptor_; std::string footer_key = properties_->getFooterKey(); if (footer_key.empty()) { - if (footer_key_metadata.empty()) + if (footer_key_metadata_.empty()) throw ParquetException("No footer key or key metadata"); if (properties_->getKeyRetriever() == nullptr) throw ParquetException("No footer key or key retriever"); try { - footer_key = properties_->getKeyRetriever()->getKey(footer_key_metadata); + footer_key = properties_->getKeyRetriever()->getKey(footer_key_metadata_); } catch (KeyAccessDeniedException &e) { std::stringstream ss; ss << "Footer key: access denied " << e.what() << "\n";; @@ -131,30 +127,28 @@ std::shared_ptr InternalFileDecryptor::GetFooterDecryptor( } auto aes_decryptor = metadata - ? GetMetaAesDecryptor(algorithm, footer_key.size()) - : GetDataAesDecryptor(algorithm, footer_key.size()); - return std::make_shared(aes_decryptor, footer_key, file_aad_, aad); + ? 
GetMetaAesDecryptor(footer_key.size()) + : GetDataAesDecryptor(footer_key.size()); + footer_decryptor_ = std::make_shared(aes_decryptor, footer_key, file_aad_, aad); + return footer_decryptor_; } std::shared_ptr InternalFileDecryptor::GetColumnMetaDecryptor( std::shared_ptr column_path, - ParquetCipher::type algorithm, const std::string& column_key_metadata, const std::string& aad) { - return GetColumnDecryptor(column_path, algorithm, column_key_metadata, aad, true); + return GetColumnDecryptor(column_path, column_key_metadata, aad, true); } std::shared_ptr InternalFileDecryptor::GetColumnDataDecryptor( std::shared_ptr column_path, - ParquetCipher::type algorithm, const std::string& column_key_metadata, const std::string& aad) { - return GetColumnDecryptor(column_path, algorithm, column_key_metadata, aad, false); + return GetColumnDecryptor(column_path, column_key_metadata, aad, false); } std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( std::shared_ptr column_path, - ParquetCipher::type algorithm, const std::string& column_key_metadata, const std::string& aad, bool metadata) { std::string column_key; @@ -189,30 +183,30 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( } auto aes_decryptor = metadata - ? GetMetaAesDecryptor(algorithm, column_key.size()) - : GetDataAesDecryptor(algorithm, column_key.size()); + ? 
GetMetaAesDecryptor(column_key.size()) + : GetDataAesDecryptor(column_key.size()); return std::make_shared(aes_decryptor, column_key, file_aad_, aad); } parquet_encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor( - ParquetCipher::type algorithm, size_t key_size) { + size_t key_size) { int key_len = static_cast(key_size); if (key_len == 16) { if (meta_decryptor_128_ == NULLPTR) { - meta_decryptor_128_.reset(new parquet_encryption::AesDecryptor(algorithm, key_len, false)); + meta_decryptor_128_.reset(new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); } return meta_decryptor_128_.get(); } else if (key_len == 24) { if (meta_decryptor_196_ == NULLPTR) { - meta_decryptor_196_.reset(new parquet_encryption::AesDecryptor(algorithm, key_len, false)); + meta_decryptor_196_.reset(new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); } return meta_decryptor_196_.get(); } else if (key_len == 32) { if (meta_decryptor_256_ == NULLPTR) { - meta_decryptor_256_.reset(new parquet_encryption::AesDecryptor(algorithm, key_len, false)); + meta_decryptor_256_.reset(new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); } return meta_decryptor_256_.get(); } @@ -220,23 +214,23 @@ parquet_encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor( } parquet_encryption::AesDecryptor* InternalFileDecryptor::GetDataAesDecryptor( - ParquetCipher::type algorithm, size_t key_size) { + size_t key_size) { int key_len = static_cast(key_size); if (key_len == 16) { if (data_decryptor_128_ == NULLPTR) { - data_decryptor_128_.reset(new parquet_encryption::AesDecryptor(algorithm, key_len, false)); + data_decryptor_128_.reset(new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); } return data_decryptor_128_.get(); } else if (key_len == 24) { if (data_decryptor_196_ == NULLPTR) { - data_decryptor_196_.reset(new parquet_encryption::AesDecryptor(algorithm, key_len, false)); + data_decryptor_196_.reset(new 
parquet_encryption::AesDecryptor(algorithm_, key_len, false)); } return data_decryptor_196_.get(); } else if (key_len == 32) { if (data_decryptor_256_ == NULLPTR) { - data_decryptor_256_.reset(new parquet_encryption::AesDecryptor(algorithm, key_len, false)); + data_decryptor_256_.reset(new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); } return data_decryptor_256_.get(); } diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h index 09508ce88700..6601e9a2b7cf 100644 --- a/cpp/src/parquet/internal_file_decryptor.h +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -51,44 +51,46 @@ class Decryptor { class InternalFileDecryptor { public: - explicit InternalFileDecryptor(FileDecryptionProperties* propperties); + explicit InternalFileDecryptor(FileDecryptionProperties* properties); void file_aad(const std::string& file_aad) { file_aad_ = file_aad; } std::string& file_aad() { return file_aad_; } - std::shared_ptr GetFooterSigningEncryptor( - ParquetCipher::type algorithm, - const std::string& footer_key_metadata); + void algorithm(ParquetCipher::type algorithm) { algorithm_ = algorithm; } + ParquetCipher::type algorithm() { return algorithm_; } - std::shared_ptr GetFooterDecryptor( - ParquetCipher::type algorithm, - const std::string& footer_key_metadata); + void footer_key_metadata(const std::string& footer_key_metadata) { + footer_key_metadata_ = footer_key_metadata; + } + std::string& footer_key_metadata() { return footer_key_metadata_; } + + std::shared_ptr GetFooterSigningEncryptor(); + + std::shared_ptr GetFooterDecryptor(); std::shared_ptr GetFooterDecryptorForColumnMeta( - ParquetCipher::type algorithm, - const std::string& footer_key_metadata, - const std::string& aad); + const std::string& aad = ""); std::shared_ptr GetFooterDecryptorForColumnData( - ParquetCipher::type algorithm, - const std::string& footer_key_metadata, - const std::string& aad); + const std::string& aad = ""); std::shared_ptr 
GetColumnMetaDecryptor( std::shared_ptr column_path, - ParquetCipher::type algorithm, const std::string& column_key_metadata, - const std::string& aad); + const std::string& aad = ""); std::shared_ptr GetColumnDataDecryptor( std::shared_ptr column_path, - ParquetCipher::type algorithm, const std::string& column_key_metadata, - const std::string& aad); + const std::string& aad = ""); private: FileDecryptionProperties* properties_; // Concatenation of aad_prefix (if exists) and aad_file_unique std::string file_aad_; - // A map between ColumnPath and their encryption keys: + // A map between ColumnPath and their encryption keys std::shared_ptr, std::string, parquet::schema::ColumnPath::CmpColumnPath>> column_map_; + ParquetCipher::type algorithm_; + std::string footer_key_metadata_; + std::shared_ptr footer_decryptor_; + std::shared_ptr footer_signing_encryptor_; std::unique_ptr meta_decryptor_128_; std::unique_ptr meta_decryptor_196_; @@ -98,19 +100,14 @@ class InternalFileDecryptor { std::unique_ptr data_decryptor_256_; std::shared_ptr GetFooterDecryptor( - ParquetCipher::type algorithm, - const std::string& footer_key_metadata, const std::string& aad, bool metadata); std::shared_ptr GetColumnDecryptor( std::shared_ptr column_path, - ParquetCipher::type algorithm, const std::string& column_key_metadata, const std::string& aad, bool metadata = false); - parquet_encryption::AesDecryptor* GetMetaAesDecryptor(ParquetCipher::type algorithm, - size_t key_size); - parquet_encryption::AesDecryptor* GetDataAesDecryptor(ParquetCipher::type algorithm, - size_t key_size); + parquet_encryption::AesDecryptor* GetMetaAesDecryptor(size_t key_size); + parquet_encryption::AesDecryptor* GetDataAesDecryptor(size_t key_size); }; } diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index a7a2315fe63c..c168094b172c 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -180,7 +180,6 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { int16_t 
column_ordinal, const ApplicationVersion* writer_version, FileDecryptionProperties* file_decryption = NULLPTR, - const EncryptionAlgorithm* algorithm = NULLPTR, InternalFileDecryptor* file_decryptor = NULLPTR) : column_(column), descr_(descr), writer_version_(writer_version) { metadata_ = column->meta_data; @@ -190,25 +189,24 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { if (ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY) { if (file_decryption == NULLPTR) { - throw ParquetException("Cannot decrypt ColumnMetadata. FileDecryptionProperties must be provided."); + throw ParquetException("Cannot decrypt ColumnMetadata. " + "FileDecryptionProperties must be provided."); } // should decrypt metadata std::shared_ptr path = std::make_shared( ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema); std::string key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata; - - DCHECK(algorithm != NULLPTR); + DCHECK(file_decryptor != NULLPTR); - - std::string aad = parquet_encryption::createModuleAAD( + + std::string aad_column_metadata = parquet_encryption::createModuleAAD( file_decryptor->file_aad(), parquet_encryption::ColumnMetaData, row_group_ordinal, column_ordinal, (int16_t)-1); auto decryptor = file_decryptor->GetColumnMetaDecryptor( - path, algorithm->algorithm, - key_metadata, aad); + path, key_metadata, aad_column_metadata); uint32_t len = static_cast(column->encrypted_column_metadata.size()); DeserializeThriftMsg( reinterpret_cast( @@ -314,12 +312,11 @@ std::unique_ptr ColumnChunkMetaData::Make( int16_t row_group_ordinal, int16_t column_ordinal, const ApplicationVersion* writer_version, FileDecryptionProperties* file_decryption, - const EncryptionAlgorithm* algorithm, InternalFileDecryptor* file_decryptor) { return std::unique_ptr( new ColumnChunkMetaData(metadata, descr, row_group_ordinal, column_ordinal, writer_version, - file_decryption, algorithm, file_decryptor)); + file_decryption, file_decryptor)); } ColumnChunkMetaData::ColumnChunkMetaData( @@ -329,7 +326,6 @@ 
ColumnChunkMetaData::ColumnChunkMetaData( int16_t column_ordinal, const ApplicationVersion* writer_version, FileDecryptionProperties* file_decryption, - const EncryptionAlgorithm* algorithm, InternalFileDecryptor* file_decryptor) : impl_{std::unique_ptr(new ColumnChunkMetaDataImpl( reinterpret_cast(metadata), @@ -337,7 +333,7 @@ ColumnChunkMetaData::ColumnChunkMetaData( row_group_ordinal, column_ordinal, writer_version, - file_decryption, algorithm, + file_decryption, file_decryptor))} {} ColumnChunkMetaData::~ColumnChunkMetaData() {} // column chunk @@ -421,7 +417,6 @@ class RowGroupMetaData::RowGroupMetaDataImpl { std::unique_ptr ColumnChunk( int i, int16_t row_group_ordinal, FileDecryptionProperties* file_decryption = NULLPTR, - const EncryptionAlgorithm* algorithm = NULLPTR, InternalFileDecryptor* file_decryptor = NULLPTR) { if (!(i < num_columns())) { std::stringstream ss; @@ -432,7 +427,7 @@ class RowGroupMetaData::RowGroupMetaDataImpl { return ColumnChunkMetaData::Make( &row_group_->columns[i], schema_->Column(i), row_group_ordinal, (int16_t)i, - writer_version_, file_decryption, algorithm, + writer_version_, file_decryption, file_decryptor); } @@ -466,9 +461,8 @@ const SchemaDescriptor* RowGroupMetaData::schema() const { return impl_->schema( std::unique_ptr RowGroupMetaData::ColumnChunk( int i, int16_t row_group_ordinal, FileDecryptionProperties* file_decryption, - const EncryptionAlgorithm* algorithm, InternalFileDecryptor* file_decryptor) const { - return impl_->ColumnChunk(i, row_group_ordinal, file_decryption, algorithm, - file_decryptor); + InternalFileDecryptor* file_decryptor) const { + return impl_->ColumnChunk(i, row_group_ordinal, file_decryption, file_decryptor); } // file metadata diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 17401da23b67..4f85064f5531 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -128,7 +128,6 @@ class PARQUET_EXPORT ColumnChunkMetaData { int16_t row_group_ordinal = 
-1, int16_t column_ordinal = -1, const ApplicationVersion* writer_version = NULLPTR, FileDecryptionProperties* file_decryption = NULLPTR, - const EncryptionAlgorithm* algorithm = NULLPTR, InternalFileDecryptor* file_decryptor = NULLPTR); ~ColumnChunkMetaData(); @@ -161,7 +160,6 @@ class PARQUET_EXPORT ColumnChunkMetaData { int16_t row_group_ordinal, int16_t column_ordinal, const ApplicationVersion* writer_version = NULLPTR, FileDecryptionProperties* file_decryption = NULLPTR, - const EncryptionAlgorithm* algorithm = NULLPTR, InternalFileDecryptor* file_decryptor = NULLPTR); // PIMPL Idiom class ColumnChunkMetaDataImpl; @@ -185,7 +183,6 @@ class PARQUET_EXPORT RowGroupMetaData { const SchemaDescriptor* schema() const; std::unique_ptr ColumnChunk(int i, int16_t row_group_ordinal = -1, FileDecryptionProperties* file_decryption = NULLPTR, - const EncryptionAlgorithm* algorithm = NULLPTR, InternalFileDecryptor* file_decryptor = NULLPTR) const; private: From 5fdb0c7a729f53c68bfbd65befcb1bfb695b252e Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Wed, 8 May 2019 08:21:27 +0300 Subject: [PATCH 044/201] Rename file_decryption to file_decryption_properties in properties.h --- .../low-level-api/encryption-reader-writer.cc | 2 +- cpp/src/parquet/file_reader.cc | 24 +++++++++---------- cpp/src/parquet/properties.h | 11 +++++---- 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc index 666c1a07e76e..9396202ed002 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc @@ -243,7 +243,7 @@ int main(int argc, char** argv) { try { parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); - reader_properties.file_decryption(file_decryption_properties[example_id]); + 
reader_properties.file_decryption_properties(file_decryption_properties[example_id]); // Create a ParquetReader instance std::unique_ptr parquet_reader = diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 707ed0b6e0c7..9f80dff80dcb 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -104,7 +104,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::unique_ptr GetColumnPageReader(int i) override { // Read column chunk from the file auto col = row_group_metadata_->ColumnChunk(i, row_group_ordinal_, - properties_.file_decryption(), + properties_.file_decryption_properties(), file_decryptor_); int64_t col_start = col->data_page_offset(); if (col->has_dictionary_page() && col->dictionary_page_offset() > 0 && @@ -273,13 +273,13 @@ class SerializedFile : public ParquetFileReader::Contents { file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &read_metadata_len); if (file_metadata_->is_plaintext_mode()) { - auto file_decryption = properties_.file_decryption(); - if (file_decryption == NULLPTR) { + auto file_decryption_properties = properties_.file_decryption_properties(); + if (file_decryption_properties == NULLPTR) { throw ParquetException("No decryption properties are provided"); } - file_decryptor_.reset(new InternalFileDecryptor(file_decryption)); + file_decryptor_.reset(new InternalFileDecryptor(file_decryption_properties)); - std::string aad_prefix = file_decryption->getAADPrefix(); + std::string aad_prefix = file_decryption_properties->getAADPrefix(); EncryptionAlgorithm algo = file_metadata_->encryption_algorithm(); if (!algo.aad.aad_prefix.empty()) { @@ -291,7 +291,7 @@ class SerializedFile : public ParquetFileReader::Contents { } aad_prefix = algo.aad.aad_prefix; std::shared_ptr aad_prefix_verifier = - file_decryption->getAADPrefixVerifier(); + file_decryption_properties->getAADPrefixVerifier(); if (aad_prefix_verifier != NULLPTR) aad_prefix_verifier->check(aad_prefix); } @@ 
-305,7 +305,7 @@ class SerializedFile : public ParquetFileReader::Contents { file_decryptor_->file_aad(fileAAD); file_decryptor_->algorithm(algo.algorithm); file_decryptor_->footer_key_metadata(file_metadata_->footer_signing_key_metadata()); - if (file_decryption->checkFooterIntegrity()) { + if (file_decryption_properties->checkFooterIntegrity()) { if (metadata_len - read_metadata_len != 28) { throw ParquetException("Invalid parquet file. Cannot verify plaintext" "mode footer."); @@ -346,18 +346,18 @@ class SerializedFile : public ParquetFileReader::Contents { throw ParquetException("Invalid parquet file. Could not read metadata bytes."); } } - auto file_decryption = properties_.file_decryption(); - if (file_decryption == nullptr) { + auto file_decryption_properties = properties_.file_decryption_properties(); + if (file_decryption_properties == nullptr) { throw ParquetException("No decryption properties are provided. Could not read " "encrypted footer metadata"); } - file_decryptor_.reset(new InternalFileDecryptor(file_decryption)); + file_decryptor_.reset(new InternalFileDecryptor(file_decryption_properties)); uint32_t crypto_metadata_len = footer_len; file_crypto_metadata_ = FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); EncryptionAlgorithm algo = file_crypto_metadata_->encryption_algorithm(); - std::string aad_prefix = file_decryption->getAADPrefix(); + std::string aad_prefix = file_decryption_properties->getAADPrefix(); if (!algo.aad.aad_prefix.empty()) { if (!aad_prefix.empty()) { @@ -368,7 +368,7 @@ class SerializedFile : public ParquetFileReader::Contents { } aad_prefix = algo.aad.aad_prefix; std::shared_ptr aad_prefix_verifier = - file_decryption->getAADPrefixVerifier(); + file_decryption_properties->getAADPrefixVerifier(); if (aad_prefix_verifier != NULLPTR) aad_prefix_verifier->check(aad_prefix); } diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index b3ced91e717d..48e6098849c2 100644 --- 
a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -66,17 +66,20 @@ class PARQUET_EXPORT ReaderProperties { int64_t buffer_size() const { return buffer_size_; } - void file_decryption(const std::shared_ptr& decryption) { - file_decryption_ = decryption; + void file_decryption_properties( + const std::shared_ptr& decryption) { + file_decryption_properties_ = decryption; } - FileDecryptionProperties* file_decryption() { return file_decryption_.get(); } + FileDecryptionProperties* file_decryption_properties() { + return file_decryption_properties_.get(); + } private: MemoryPool* pool_; int64_t buffer_size_; bool buffered_stream_enabled_; - std::shared_ptr file_decryption_; + std::shared_ptr file_decryption_properties_; }; ReaderProperties PARQUET_EXPORT default_reader_properties(); From 931829b44a8e685f3d1e0c44c9c31314c559aef6 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Wed, 8 May 2019 08:35:09 +0300 Subject: [PATCH 045/201] Do not pass file_decryption as function parameter --- cpp/src/parquet/file_reader.cc | 1 - cpp/src/parquet/internal_file_decryptor.h | 2 ++ cpp/src/parquet/metadata.cc | 17 +++++------------ cpp/src/parquet/metadata.h | 3 --- 4 files changed, 7 insertions(+), 16 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 9f80dff80dcb..a97d93f39a5e 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -104,7 +104,6 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::unique_ptr GetColumnPageReader(int i) override { // Read column chunk from the file auto col = row_group_metadata_->ColumnChunk(i, row_group_ordinal_, - properties_.file_decryption_properties(), file_decryptor_); int64_t col_start = col->data_page_offset(); if (col->has_dictionary_page() && col->dictionary_page_offset() > 0 && diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h index 6601e9a2b7cf..9646ebbb652a 100644 --- 
a/cpp/src/parquet/internal_file_decryptor.h +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -66,6 +66,8 @@ class InternalFileDecryptor { std::shared_ptr GetFooterSigningEncryptor(); + FileDecryptionProperties* properties() { return properties_; } + std::shared_ptr GetFooterDecryptor(); std::shared_ptr GetFooterDecryptorForColumnMeta( const std::string& aad = ""); diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index c168094b172c..9faf8ff8808c 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -179,7 +179,6 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { int16_t row_group_ordinal, int16_t column_ordinal, const ApplicationVersion* writer_version, - FileDecryptionProperties* file_decryption = NULLPTR, InternalFileDecryptor* file_decryptor = NULLPTR) : column_(column), descr_(descr), writer_version_(writer_version) { metadata_ = column->meta_data; @@ -188,7 +187,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { format::ColumnCryptoMetaData ccmd = column->crypto_metadata; if (ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY) { - if (file_decryption == NULLPTR) { + if (file_decryptor->properties() == NULLPTR) { throw ParquetException("Cannot decrypt ColumnMetadata. 
" "FileDecryptionProperties must be provided."); } @@ -311,12 +310,11 @@ std::unique_ptr ColumnChunkMetaData::Make( const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal, int16_t column_ordinal, const ApplicationVersion* writer_version, - FileDecryptionProperties* file_decryption, InternalFileDecryptor* file_decryptor) { return std::unique_ptr( new ColumnChunkMetaData(metadata, descr, row_group_ordinal, column_ordinal, writer_version, - file_decryption, file_decryptor)); + file_decryptor)); } ColumnChunkMetaData::ColumnChunkMetaData( @@ -325,7 +323,6 @@ ColumnChunkMetaData::ColumnChunkMetaData( int16_t row_group_ordinal, int16_t column_ordinal, const ApplicationVersion* writer_version, - FileDecryptionProperties* file_decryption, InternalFileDecryptor* file_decryptor) : impl_{std::unique_ptr(new ColumnChunkMetaDataImpl( reinterpret_cast(metadata), @@ -333,7 +330,6 @@ ColumnChunkMetaData::ColumnChunkMetaData( row_group_ordinal, column_ordinal, writer_version, - file_decryption, file_decryptor))} {} ColumnChunkMetaData::~ColumnChunkMetaData() {} // column chunk @@ -416,7 +412,6 @@ class RowGroupMetaData::RowGroupMetaDataImpl { std::unique_ptr ColumnChunk( int i, int16_t row_group_ordinal, - FileDecryptionProperties* file_decryption = NULLPTR, InternalFileDecryptor* file_decryptor = NULLPTR) { if (!(i < num_columns())) { std::stringstream ss; @@ -427,8 +422,7 @@ class RowGroupMetaData::RowGroupMetaDataImpl { return ColumnChunkMetaData::Make( &row_group_->columns[i], schema_->Column(i), row_group_ordinal, (int16_t)i, - writer_version_, file_decryption, - file_decryptor); + writer_version_, file_decryptor); } private: @@ -460,9 +454,8 @@ int64_t RowGroupMetaData::total_byte_size() const { return impl_->total_byte_siz const SchemaDescriptor* RowGroupMetaData::schema() const { return impl_->schema(); } std::unique_ptr RowGroupMetaData::ColumnChunk( - int i, int16_t row_group_ordinal, FileDecryptionProperties* file_decryption, - 
InternalFileDecryptor* file_decryptor) const { - return impl_->ColumnChunk(i, row_group_ordinal, file_decryption, file_decryptor); + int i, int16_t row_group_ordinal, InternalFileDecryptor* file_decryptor) const { + return impl_->ColumnChunk(i, row_group_ordinal, file_decryptor); } // file metadata diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 4f85064f5531..63c3f1c12e6b 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -127,7 +127,6 @@ class PARQUET_EXPORT ColumnChunkMetaData { const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal = -1, int16_t column_ordinal = -1, const ApplicationVersion* writer_version = NULLPTR, - FileDecryptionProperties* file_decryption = NULLPTR, InternalFileDecryptor* file_decryptor = NULLPTR); ~ColumnChunkMetaData(); @@ -159,7 +158,6 @@ class PARQUET_EXPORT ColumnChunkMetaData { explicit ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal, int16_t column_ordinal, const ApplicationVersion* writer_version = NULLPTR, - FileDecryptionProperties* file_decryption = NULLPTR, InternalFileDecryptor* file_decryptor = NULLPTR); // PIMPL Idiom class ColumnChunkMetaDataImpl; @@ -182,7 +180,6 @@ class PARQUET_EXPORT RowGroupMetaData { // Return const-pointer to make it clear that this object is not to be copied const SchemaDescriptor* schema() const; std::unique_ptr ColumnChunk(int i, int16_t row_group_ordinal = -1, - FileDecryptionProperties* file_decryption = NULLPTR, InternalFileDecryptor* file_decryptor = NULLPTR) const; private: From fde0627d59e5f8ba839655fa74d47660594de32b Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Wed, 8 May 2019 10:27:44 +0300 Subject: [PATCH 046/201] Rename is_plaintext_mode to is_encryption_algorithm_set --- cpp/src/parquet/file_reader.cc | 2 +- cpp/src/parquet/metadata.cc | 6 +++--- cpp/src/parquet/metadata.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git 
a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index a97d93f39a5e..f1d3bb72c9ad 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -271,7 +271,7 @@ class SerializedFile : public ParquetFileReader::Contents { uint32_t read_metadata_len = metadata_len; file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &read_metadata_len); - if (file_metadata_->is_plaintext_mode()) { + if (file_metadata_->is_encryption_algorithm_set()) { auto file_decryption_properties = properties_.file_decryption_properties(); if (file_decryption_properties == NULLPTR) { throw ParquetException("No decryption properties are provided"); diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 9faf8ff8808c..a5d2a5e1daf9 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -515,7 +515,7 @@ class FileMetaData::FileMetaDataImpl { inline int num_schema_elements() const { return static_cast(metadata_->schema.size()); } - inline bool is_plaintext_mode() const { return metadata_->__isset.encryption_algorithm; } + inline bool is_encryption_algorithm_set() const { return metadata_->__isset.encryption_algorithm; } inline EncryptionAlgorithm encryption_algorithm() { return FromThrift(metadata_->encryption_algorithm); } @@ -528,7 +528,7 @@ class FileMetaData::FileMetaDataImpl { void WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryptor) const { ThriftSerializer serializer; - if (is_plaintext_mode()) { + if (is_encryption_algorithm_set()) { uint8_t* serialized_data; uint32_t serialized_len; serializer.SerializeToBuffer(metadata_.get(), &serialized_len, &serialized_data); @@ -667,7 +667,7 @@ int64_t FileMetaData::num_rows() const { return impl_->num_rows(); } int FileMetaData::num_row_groups() const { return impl_->num_row_groups(); } -bool FileMetaData::is_plaintext_mode() const { return impl_->is_plaintext_mode(); } +bool FileMetaData::is_encryption_algorithm_set() const { return 
impl_->is_encryption_algorithm_set(); } EncryptionAlgorithm FileMetaData::encryption_algorithm() const { return impl_->encryption_algorithm(); diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 63c3f1c12e6b..1f12e2355f97 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -211,7 +211,7 @@ class PARQUET_EXPORT FileMetaData { int64_t num_rows() const; int num_row_groups() const; - bool is_plaintext_mode() const; + bool is_encryption_algorithm_set() const; EncryptionAlgorithm encryption_algorithm() const; const std::string& footer_signing_key_metadata() const; ParquetVersion::type version() const; From 85dd7ee5a60622557eaf11aeee3ae5ab0b3bf353 Mon Sep 17 00:00:00 2001 From: Ha Thi Tham Date: Thu, 9 May 2019 13:56:39 +0700 Subject: [PATCH 047/201] fix function naming --- .../low-level-api/encryption-reader-writer.cc | 22 ++-- cpp/src/parquet/column_reader.cc | 14 +-- cpp/src/parquet/column_writer.cc | 10 +- cpp/src/parquet/encryption.cc | 8 +- cpp/src/parquet/encryption.h | 10 +- cpp/src/parquet/encryption_properties.cc | 80 +++++++------ cpp/src/parquet/encryption_properties.h | 112 +++++++++--------- cpp/src/parquet/file_reader.cc | 14 +-- cpp/src/parquet/file_writer.cc | 10 +- cpp/src/parquet/internal_file_decryptor.cc | 18 +-- cpp/src/parquet/internal_file_decryptor.h | 2 +- cpp/src/parquet/internal_file_encryptor.cc | 32 ++--- cpp/src/parquet/internal_file_encryptor.h | 2 +- cpp/src/parquet/metadata.cc | 22 ++-- cpp/src/parquet/properties.h | 10 +- 15 files changed, 189 insertions(+), 177 deletions(-) diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc index 9396202ed002..e0d44d60667e 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc @@ -55,7 +55,7 @@ int main(int argc, char** argv) { // uniform encryption 
parquet::FileEncryptionProperties::Builder file_encryption_builder_1(FOOTER_ENCRYPTION_KEY); parquet::FileDecryptionProperties::Builder decryption_properties_builder_1; - decryption_properties_builder_1.withFooterKey(FOOTER_ENCRYPTION_KEY); + decryption_properties_builder_1.footer_key(FOOTER_ENCRYPTION_KEY); // non-uniform with column keys std::map, @@ -63,7 +63,7 @@ int main(int argc, char** argv) { parquet::schema::ColumnPath::CmpColumnPath> encryption_cols; std::shared_ptr path_ptr = parquet::schema::ColumnPath::FromDotString("ba_field"); parquet::ColumnEncryptionProperties::Builder encryption_col_builder_0(path_ptr); - encryption_col_builder_0.withKey(COLUMN_ENCRYPTION_KEY); + encryption_col_builder_0.key(COLUMN_ENCRYPTION_KEY); auto encryption_col0 = encryption_col_builder_0.build(); encryption_cols[path_ptr] = encryption_col0; @@ -72,29 +72,29 @@ int main(int argc, char** argv) { std::shared_ptr, parquet::schema::ColumnPath::CmpColumnPath> decryption_cols; parquet::ColumnDecryptionProperties::Builder decryption_col_builder2(path_ptr); - decryption_col_builder2.withKey(COLUMN_ENCRYPTION_KEY); + decryption_col_builder2.key(COLUMN_ENCRYPTION_KEY); decryption_cols[path_ptr] = decryption_col_builder2.build(); - file_encryption_builder_2.withEncryptedColumns(encryption_cols); + file_encryption_builder_2.column_properties(encryption_cols); parquet::FileDecryptionProperties::Builder decryption_properties_builder_2; - decryption_properties_builder_2.withFooterKey(FOOTER_ENCRYPTION_KEY); - decryption_properties_builder_2.withColumnKeys(decryption_cols); + decryption_properties_builder_2.footer_key(FOOTER_ENCRYPTION_KEY); + decryption_properties_builder_2.column_properties(decryption_cols); // plain mode footer = unencrypted footer parquet::FileEncryptionProperties::Builder file_encryption_builder_3(FOOTER_ENCRYPTION_KEY); - file_encryption_builder_3.withPlaintextFooter(); + file_encryption_builder_3.enable_plaintext_footer(); parquet::FileDecryptionProperties::Builder 
decryption_properties_builder_3; - decryption_properties_builder_3.withFooterKey(FOOTER_ENCRYPTION_KEY); + decryption_properties_builder_3.footer_key(FOOTER_ENCRYPTION_KEY); // plaintext mode footer, hidden column parquet::FileEncryptionProperties::Builder file_encryption_builder_4(FOOTER_ENCRYPTION_KEY); - file_encryption_builder_4.withPlaintextFooter(); - file_encryption_builder_4.withEncryptedColumns(encryption_cols); // reusing encryption_cols + file_encryption_builder_4.enable_plaintext_footer(); + file_encryption_builder_4.column_properties(encryption_cols); // reusing encryption_cols parquet::FileDecryptionProperties::Builder decryption_properties_builder_4; - decryption_properties_builder_4.withFooterKey(FOOTER_ENCRYPTION_KEY); + decryption_properties_builder_4.footer_key(FOOTER_ENCRYPTION_KEY); file_encryption_properties.push_back(file_encryption_builder_1.build()); file_encryption_properties.push_back(file_encryption_builder_2.build()); diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 3a0a4188d91c..2bddc3b971b2 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -138,18 +138,18 @@ class SerializedPageReader : public PageReader { max_page_header_size_ = kDefaultMaxPageHeaderSize; decompressor_ = GetCodecFromArrow(codec); if (data_decryptor_ != NULLPTR) { - DCHECK (!data_decryptor_->fileAAD().empty()); + DCHECK (!data_decryptor_->file_aad().empty()); //prepare the AAD for quick update later data_pageAAD_ = parquet_encryption::createModuleAAD( - data_decryptor_->fileAAD(), + data_decryptor_->file_aad(), parquet_encryption::DataPage, row_group_ordinal_, column_ordinal_, (int16_t)-1); } if (meta_decryptor_ != NULLPTR) { - DCHECK (!meta_decryptor_->fileAAD().empty()); + DCHECK (!meta_decryptor_->file_aad().empty()); data_page_headerAAD_ = parquet_encryption::createModuleAAD( - meta_decryptor_->fileAAD(), + meta_decryptor_->file_aad(), parquet_encryption::DataPageHeader, 
row_group_ordinal_, column_ordinal_, (int16_t)-1); @@ -229,7 +229,7 @@ std::shared_ptr SerializedPageReader::NextPage() { if (meta_decryptor_ != NULLPTR) { if (current_page_is_dictionary) { aad = parquet_encryption::createModuleAAD( - meta_decryptor_->fileAAD(), + meta_decryptor_->file_aad(), parquet_encryption::DictionaryPageHeader, row_group_ordinal_, column_ordinal_, (int16_t)-1); @@ -261,10 +261,10 @@ std::shared_ptr SerializedPageReader::NextPage() { int compressed_len = current_page_header_.compressed_page_size; int uncompressed_len = current_page_header_.uncompressed_page_size; if (data_decryptor_ != NULLPTR){ - DCHECK(!data_decryptor_->fileAAD().empty()); + DCHECK(!data_decryptor_->file_aad().empty()); if (current_page_is_dictionary){ aad = parquet_encryption::createModuleAAD( - data_decryptor_->fileAAD(), + data_decryptor_->file_aad(), parquet_encryption::DictionaryPage, row_group_ordinal_, column_ordinal_, (int16_t)-1); diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index d855e0cf3c10..3ea5f7124cac 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -160,14 +160,14 @@ class SerializedPageWriter : public PageWriter { if (data_encryptor_ != NULLPTR) { //prepare the add for quick update later data_pageAAD_ = parquet_encryption::createModuleAAD( - data_encryptor_->fileAAD(), + data_encryptor_->file_aad(), parquet_encryption::DataPage, row_group_ordinal_, column_ordinal_, (int16_t)-1); } if (meta_encryptor_ != NULLPTR) { data_page_headerAAD_ = parquet_encryption::createModuleAAD( - meta_encryptor_->fileAAD(), + meta_encryptor_->file_aad(), parquet_encryption::DataPageHeader, row_group_ordinal_, column_ordinal_, (int16_t)-1); @@ -199,7 +199,7 @@ class SerializedPageWriter : public PageWriter { std::shared_ptr encrypted_data_buffer = nullptr; if (data_encryptor_.get()) { data_encryptor_->aad( - parquet_encryption::createModuleAAD(data_encryptor_->fileAAD(), + 
parquet_encryption::createModuleAAD(data_encryptor_->file_aad(), parquet_encryption::DictionaryPage, row_group_ordinal_, column_ordinal_, (int16_t)-1)); @@ -225,7 +225,7 @@ class SerializedPageWriter : public PageWriter { if (meta_encryptor_) { meta_encryptor_->aad( - parquet_encryption::createModuleAAD(meta_encryptor_->fileAAD(), + parquet_encryption::createModuleAAD(meta_encryptor_->file_aad(), parquet_encryption::DictionaryPageHeader, row_group_ordinal_, column_ordinal_, (int16_t)-1)); @@ -248,7 +248,7 @@ class SerializedPageWriter : public PageWriter { fallback); if (meta_encryptor_ != nullptr){ meta_encryptor_->aad( - parquet_encryption::createModuleAAD(meta_encryptor_->fileAAD(), + parquet_encryption::createModuleAAD(meta_encryptor_->file_aad(), parquet_encryption::ColumnMetaData, row_group_ordinal_, column_ordinal_, (int16_t)-1)); diff --git a/cpp/src/parquet/encryption.cc b/cpp/src/parquet/encryption.cc index 542972538564..0a2d9ef939a6 100644 --- a/cpp/src/parquet/encryption.cc +++ b/cpp/src/parquet/encryption.cc @@ -22,11 +22,11 @@ namespace parquet { // integer key retriever -void IntegerKeyIdRetriever::putKey(uint32_t key_id, const std::string& key) { +void IntegerKeyIdRetriever::PutKey(uint32_t key_id, const std::string& key) { key_map_.insert(std::make_pair(key_id, key)); } -const std::string& IntegerKeyIdRetriever::getKey(const std::string& key_metadata) { +const std::string& IntegerKeyIdRetriever::GetKey(const std::string& key_metadata) { uint32_t key_id; memcpy(reinterpret_cast(&key_id), key_metadata.c_str(), 4); @@ -34,11 +34,11 @@ const std::string& IntegerKeyIdRetriever::getKey(const std::string& key_metadata } // string key retriever -void StringKeyIdRetriever::putKey(const std::string& key_id, const std::string& key) { +void StringKeyIdRetriever::PutKey(const std::string& key_id, const std::string& key) { key_map_.insert(std::make_pair(key_id, key)); } -const std::string& StringKeyIdRetriever::getKey(const std::string& key_id) { +const 
std::string& StringKeyIdRetriever::GetKey(const std::string& key_id) { return key_map_[key_id]; } diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 60b7b8c33004..3a4481bd4aad 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -28,15 +28,15 @@ namespace parquet { class PARQUET_EXPORT DecryptionKeyRetriever { public: - virtual const std::string& getKey(const std::string& key_metadata) = 0; + virtual const std::string& GetKey(const std::string& key_metadata) = 0; virtual ~DecryptionKeyRetriever() {} }; // Simple integer key retriever class PARQUET_EXPORT IntegerKeyIdRetriever : public DecryptionKeyRetriever { public: - void putKey(uint32_t key_id, const std::string& key); - const std::string& getKey(const std::string& key_metadata); + void PutKey(uint32_t key_id, const std::string& key); + const std::string& GetKey(const std::string& key_metadata); private: std::map key_map_; @@ -45,8 +45,8 @@ class PARQUET_EXPORT IntegerKeyIdRetriever : public DecryptionKeyRetriever { // Simple string key retriever class PARQUET_EXPORT StringKeyIdRetriever : public DecryptionKeyRetriever { public: - void putKey(const std::string& key_id, const std::string& key); - const std::string& getKey(const std::string& key_metadata); + void PutKey(const std::string& key_id, const std::string& key); + const std::string& GetKey(const std::string& key_metadata); private: std::map key_map_; diff --git a/cpp/src/parquet/encryption_properties.cc b/cpp/src/parquet/encryption_properties.cc index ba893ce1816b..36cccdb672d2 100644 --- a/cpp/src/parquet/encryption_properties.cc +++ b/cpp/src/parquet/encryption_properties.cc @@ -23,16 +23,17 @@ namespace parquet { -ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::withKeyID( +ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key_id( std::string key_id) { //key_id is expected to be in UTF8 encoding ::arrow::util::InitializeUTF8(); const uint8_t *data = 
reinterpret_cast(key_id.c_str()); - if (!::arrow::util::ValidateUTF8(data, key_id.size())) + if (!::arrow::util::ValidateUTF8(data, key_id.size())) { throw ParquetException("key id should be in UTF8 encoding"); + } DCHECK(!key_id.empty()); - this->withKeyMetaData(key_id); + this->key_metadata(key_id); return this; } @@ -40,17 +41,20 @@ ColumnEncryptionProperties::ColumnEncryptionProperties( bool encrypted, const std::shared_ptr& column_path, const std::string& key, - const std::string& key_metadata):column_path_(column_path){ + const std::string& key_metadata):column_path_(column_path) { DCHECK(column_path != nullptr); - if (!encrypted) + if (!encrypted) { DCHECK(key.empty() && key_metadata.empty()); + } - if (!key.empty()) + if (!key.empty()) { DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); + } encrypted_with_footer_key_ = (encrypted && key.empty()); - if (encrypted_with_footer_key_) + if (encrypted_with_footer_key_){ DCHECK(key_metadata.empty()); + } encrypted_ = encrypted; key_metadata_ = key_metadata; @@ -58,22 +62,24 @@ ColumnEncryptionProperties::ColumnEncryptionProperties( } ColumnDecryptionProperties::ColumnDecryptionProperties( - const std::shared_ptr& column_path, - const std::string& key):column_path_(column_path){ + const std::shared_ptr& column_path, + const std::string& key):column_path_(column_path){ DCHECK(column_path != nullptr); - if (!key.empty()) + if (!key.empty()) { DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); + } key_ = key; } -const std::string& FileDecryptionProperties::getColumnKey( +const std::string& FileDecryptionProperties::column_key( const std::shared_ptr& column_path) { - if (column_property_map_.find(column_path) != column_property_map_.end()) { - auto column_prop = column_property_map_[column_path]; - if (column_prop != nullptr) - return column_prop->getKey(); + if (column_properties_.find(column_path) != column_properties_.end()) { + auto column_prop = 
column_properties_[column_path]; + if (column_prop != nullptr) { + return column_prop->key(); + } } return NULL_STRING; } @@ -86,47 +92,52 @@ FileDecryptionProperties::FileDecryptionProperties( std::shared_ptr aad_prefix_verifier, std::map, std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_property_map) { + schema::ColumnPath::CmpColumnPath> column_properties) { DCHECK(!footer_key.empty() || NULLPTR != key_retriever || - 0 != column_property_map.size()); + 0 != column_properties.size()); - if (!footer_key.empty()) + if (!footer_key.empty()) { DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || footer_key.length() == 32); - if (footer_key.empty() && check_plaintext_footer_integrity) + } + if (footer_key.empty() && check_plaintext_footer_integrity) { DCHECK(NULLPTR != key_retriever); + } aad_prefix_verifier_ = aad_prefix_verifier; footer_key_ = footer_key; check_plaintext_footer_integrity_ = check_plaintext_footer_integrity; key_retriever_ = key_retriever; aad_prefix_ = aad_prefix; - column_property_map_ = column_property_map; + column_properties_ = column_properties; } -FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::withFooterKeyID( +FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key_id( std::string key_id) { //key_id is expected to be in UTF8 encoding ::arrow::util::InitializeUTF8(); const uint8_t* data = reinterpret_cast(key_id.c_str()); - if (!::arrow::util::ValidateUTF8(data, key_id.size())) + if (!::arrow::util::ValidateUTF8(data, key_id.size())) { throw ParquetException("footer key id should be in UTF8 encoding"); + } - if (key_id.empty()) + if (key_id.empty()) { return this; + } - return withFooterKeyMetadata(key_id); + return footer_key_metadata(key_id); } -std::shared_ptr FileEncryptionProperties::getColumnProperties( +std::shared_ptr FileEncryptionProperties::column_properties( const std::shared_ptr& column_path) { - if (column_property_map_.size () == 0){ + if 
(column_properties_.size () == 0) { auto builder = std::shared_ptr( - new ColumnEncryptionProperties::Builder (column_path)); + new ColumnEncryptionProperties::Builder(column_path)); return builder->build(); } - if (column_property_map_.find(column_path) != column_property_map_.end()) - return column_property_map_[column_path]; + if (column_properties_.find(column_path) != column_properties_.end()) { + return column_properties_[column_path]; + } return NULLPTR; } @@ -140,11 +151,11 @@ FileEncryptionProperties::FileEncryptionProperties(ParquetCipher::type cipher, const std::map, std::shared_ptr, schema::ColumnPath::CmpColumnPath>& - column_property_map) + column_properties) : footer_key_(footer_key), footer_key_metadata_(footer_key_metadata), encrypted_footer_(encrypted_footer), - column_property_map_(column_property_map){ + column_properties_(column_properties) { DCHECK(!footer_key.empty()); // footer_key must be either 16, 24 or 32 bytes. DCHECK(footer_key.length() == 16 @@ -159,10 +170,11 @@ FileEncryptionProperties::FileEncryptionProperties(ParquetCipher::type cipher, AAD_FILE_UNIQUE_LENGTH) ; bool supply_aad_prefix = false; - if (aad_prefix.empty()) - file_AAD_ = aad_file_unique_str; + if (aad_prefix.empty()) { + file_aad_ = aad_file_unique_str; + } else { - file_AAD_ = aad_prefix + aad_file_unique_str; + file_aad_ = aad_prefix + aad_file_unique_str; if (!store_aad_prefix_in_file) supply_aad_prefix = true; } algorithm_.algorithm = cipher; diff --git a/cpp/src/parquet/encryption_properties.h b/cpp/src/parquet/encryption_properties.h index d2e4246e8a62..738a762d0c69 100644 --- a/cpp/src/parquet/encryption_properties.h +++ b/cpp/src/parquet/encryption_properties.h @@ -56,7 +56,7 @@ class PARQUET_EXPORT ColumnEncryptionProperties { // If key is not set on an encrypted column, the column will // be encrypted with the footer key. // keyBytes Key length must be either 16, 24 or 32 bytes. 
- Builder* withKey(const std::string& key) { + Builder* key(const std::string& key) { if (key.empty ()) return this; @@ -67,7 +67,7 @@ class PARQUET_EXPORT ColumnEncryptionProperties { // Set a key retrieval metadata. // use either withKeyMetaData or withKeyID, not both - Builder* withKeyMetaData(const std::string& key_metadata) { + Builder* key_metadata(const std::string& key_metadata) { DCHECK(!key_metadata.empty()); DCHECK(key_metadata_.empty()); key_metadata_ = key_metadata; @@ -77,7 +77,7 @@ class PARQUET_EXPORT ColumnEncryptionProperties { // Set a key retrieval metadata (converted from String). // use either withKeyMetaData or withKeyID, not both // key_id will be converted to metadata (UTF-8 array). - Builder* withKeyID(std::string key_id); + Builder* key_id(std::string key_id); std::shared_ptr build() { return std::shared_ptr( @@ -97,11 +97,11 @@ class PARQUET_EXPORT ColumnEncryptionProperties { : column_path_(path), encrypted_(encrypted) {} }; - const std::shared_ptr& getPath() { return column_path_; } - bool isEncrypted() const { return encrypted_; } - bool isEncryptedWithFooterKey() const { return encrypted_with_footer_key_; } - const std::string& getKey() const { return key_; } - const std::string& getKeyMetaData() const { return key_metadata_; } + const std::shared_ptr& column_path() { return column_path_; } + bool is_encrypted() const { return encrypted_; } + bool is_encrypted_with_footer_key() const { return encrypted_with_footer_key_; } + const std::string& key() const { return key_; } + const std::string& key_metadata() const { return key_metadata_; } ColumnEncryptionProperties() = default; ColumnEncryptionProperties(const ColumnEncryptionProperties& other) = default; @@ -136,7 +136,7 @@ class PARQUET_EXPORT ColumnDecryptionProperties { // key metadata for this column the metadata will be ignored, // the column will be decrypted with this key. // key length must be either 16, 24 or 32 bytes. 
- Builder* withKey(const std::string& key) { + Builder* key(const std::string& key) { if (key.empty ()) return this; @@ -159,8 +159,8 @@ class PARQUET_EXPORT ColumnDecryptionProperties { ColumnDecryptionProperties(const ColumnDecryptionProperties& other) = default; ColumnDecryptionProperties(ColumnDecryptionProperties&& other) = default; - const std::shared_ptr& getPath() { return column_path_; } - const std::string& getKey() const { return key_; } + const std::shared_ptr& column_path() { return column_path_; } + const std::string& key() const { return key_; } private: const std::shared_ptr column_path_; @@ -188,7 +188,7 @@ class PARQUET_EXPORT FileDecryptionProperties { public: class Builder { public: - Builder(){ + Builder() { check_plaintext_footer_integrity_ = DEFAULT_CHECK_SIGNATURE; } @@ -198,7 +198,7 @@ class PARQUET_EXPORT FileDecryptionProperties { // If explicit key is not set, footer key will be fetched from // key retriever. // param footerKey Key length must be either 16, 24 or 32 bytes. - Builder* withFooterKey(std::string footer_key) { + Builder* footer_key(std::string footer_key) { if (footer_key.empty ()) { return this; } @@ -213,17 +213,17 @@ class PARQUET_EXPORT FileDecryptionProperties { // invocation of the retriever callback. // If an explicit key is available for a footer or a column, // its key metadata will be ignored. - Builder* withColumnKeys(const std::map, + Builder* column_properties(const std::map, std::shared_ptr, schema::ColumnPath::CmpColumnPath>& column_properties) { if (column_properties.size () == 0) return this; - if (column_property_map_.size () != 0) + if (column_properties_.size () != 0) throw ParquetException("Column properties already set"); - column_property_map_ = column_properties; + column_properties_ = column_properties; return this; } @@ -233,7 +233,7 @@ class PARQUET_EXPORT FileDecryptionProperties { // invocation of the retriever callback. 
// If an explicit key is available for a footer or a column, // its key metadata will be ignored. - Builder* withKeyRetriever(const std::shared_ptr& + Builder* key_retriever(const std::shared_ptr& key_retriever) { if (key_retriever == NULLPTR) return this; @@ -249,7 +249,7 @@ class PARQUET_EXPORT FileDecryptionProperties { // - footer signing key is not available // (not passed, or not found by key retriever) // - footer content and signature don't match - Builder* withoutFooterSignatureVerification() { + Builder* disable_footer_signature_verification() { check_plaintext_footer_integrity_ = false; return this; } @@ -258,7 +258,7 @@ class PARQUET_EXPORT FileDecryptionProperties { // A must when a prefix is used for file encryption, but not stored in file. // If AAD prefix is stored in file, it will be compared to the explicitly // supplied value and an exception will be thrown if they differ. - Builder* withAADPrefix(std::string aad_prefix) { + Builder* aad_prefix(std::string aad_prefix) { if (aad_prefix.empty()) { return this; } @@ -268,7 +268,7 @@ class PARQUET_EXPORT FileDecryptionProperties { } // Set callback for verification of AAD Prefixes stored in file. 
- Builder* withAADPrefixVerifier( + Builder* aad_prefix_verifier( std::shared_ptr aad_prefix_verifier) { if (aad_prefix_verifier == NULLPTR) return this; @@ -285,7 +285,7 @@ class PARQUET_EXPORT FileDecryptionProperties { check_plaintext_footer_integrity_, aad_prefix_, aad_prefix_verifier_, - column_property_map_)); + column_properties_)); } private: @@ -295,29 +295,29 @@ class PARQUET_EXPORT FileDecryptionProperties { std::map, std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_property_map_; + schema::ColumnPath::CmpColumnPath> column_properties_; std::shared_ptr key_retriever_; bool check_plaintext_footer_integrity_; }; - const std::string& getColumnKey( + const std::string& column_key( const std::shared_ptr& column_path); - const std::string& getFooterKey() { + const std::string& footer_key() { return footer_key_; } - const std::string& getAADPrefix() { return aad_prefix_; } - std::shared_ptr getKeyRetriever() { + const std::string& aad_prefix() { return aad_prefix_; } + std::shared_ptr key_retriever() { return key_retriever_; } - bool checkFooterIntegrity() { + bool check_plaintext_footer_integrity() { return check_plaintext_footer_integrity_; } - const std::shared_ptr &getAADPrefixVerifier() { + const std::shared_ptr& aad_prefix_verifier() { return aad_prefix_verifier_; } @@ -328,7 +328,7 @@ class PARQUET_EXPORT FileDecryptionProperties { std::map, std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_property_map_; + schema::ColumnPath::CmpColumnPath> column_properties_; std::shared_ptr key_retriever_; bool check_plaintext_footer_integrity_; @@ -341,7 +341,7 @@ class PARQUET_EXPORT FileDecryptionProperties { std::shared_ptr aad_prefix_verifier, std::map, std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_property_map); + schema::ColumnPath::CmpColumnPath> column_properties); }; class PARQUET_EXPORT FileEncryptionProperties { @@ -357,25 +357,25 @@ class PARQUET_EXPORT FileEncryptionProperties { // Create files with plaintext footer. 
// If not called, the files will be created with encrypted footer (default). - Builder* withPlaintextFooter() { + Builder* enable_plaintext_footer() { encrypted_footer_ = false; return this; } // Set encryption algorithm. // If not called, files will be encrypted with AES_GCM_V1 (default). - Builder* withAlgorithm(ParquetCipher::type parquet_cipher) { + Builder* algorithm(ParquetCipher::type parquet_cipher) { parquet_cipher_ = parquet_cipher; return this; } // Set a key retrieval metadata (converted from String). - // use either withFooterKeyMetaData or withFooterKeyID, not both. - Builder* withFooterKeyID(std::string key_id); + // use either footer_key_metadata or footer_key_id, not both. + Builder* footer_key_id(std::string key_id); // Set a key retrieval metadata. - // use either withFooterKeyMetaData or withFooterKeyID, not both. - Builder* withFooterKeyMetadata(const std::string& footer_key_metadata) { + // use either footer_key_metadata or footer_key_id, not both. + Builder* footer_key_metadata(const std::string& footer_key_metadata) { if (footer_key_metadata.empty()) return this; @@ -385,7 +385,7 @@ class PARQUET_EXPORT FileEncryptionProperties { } // Set the file AAD Prefix. - Builder* withAADPrefix(const std::string& aad_prefix) { + Builder* aad_prefix(const std::string& aad_prefix) { if (aad_prefix.empty()) return this; @@ -397,7 +397,7 @@ class PARQUET_EXPORT FileEncryptionProperties { // Skip storing AAD Prefix in file. // If not called, and if AAD Prefix is set, it will be stored. - Builder* withoutAADPrefixStorage() { + Builder* disable_store_aad_prefix_storage() { DCHECK(!aad_prefix_.empty()); store_aad_prefix_in_file_ = false; @@ -407,18 +407,18 @@ class PARQUET_EXPORT FileEncryptionProperties { // Set the list of encrypted columns and their properties (keys etc). // If not called, all columns will be encrypted with the footer key. // If called, the file columns not in the list will be left unencrypted. 
- Builder* withEncryptedColumns( + Builder* column_properties( const std::map, std::shared_ptr, schema::ColumnPath::CmpColumnPath>& - encryptedColumns){ - if (encryptedColumns.size () == 0) + column_properties){ + if (column_properties.size () == 0) return this; - if (column_property_map_.size () != 0) + if (column_properties_.size () != 0) throw ParquetException("Column properties already set"); - column_property_map_ = encryptedColumns; + column_properties_ = column_properties; return this; } @@ -430,7 +430,7 @@ class PARQUET_EXPORT FileEncryptionProperties { encrypted_footer_, aad_prefix_, store_aad_prefix_in_file_, - column_property_map_)); + column_properties_)); } private: @@ -443,33 +443,33 @@ class PARQUET_EXPORT FileEncryptionProperties { bool store_aad_prefix_in_file_; std::map, std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_property_map_; + schema::ColumnPath::CmpColumnPath> column_properties_; }; - bool encryptedFooter() const { return encrypted_footer_; } + bool encrypted_footer() const { return encrypted_footer_; } - const EncryptionAlgorithm getAlgorithm() { + const EncryptionAlgorithm algorithm() { return algorithm_; } - const std::string& getFooterEncryptionKey() { + const std::string& footer_encryption_key() { return (encrypted_footer_? footer_key_ : NULL_STRING); } - const std::string& getFooterEncryptionKeyMetadata() { + const std::string& footer_encryption_key_metadata() { return (encrypted_footer_? footer_key_metadata_ : NULL_STRING); } - const std::string& getFooterSigningKey() { + const std::string& footer_signing_key() { return (encrypted_footer_? NULL_STRING : footer_key_); } - const std::string& getFooterSigningKeyMetadata() { + const std::string& footer_signing_key_metadata() { return (encrypted_footer_? 
NULL_STRING : footer_key_metadata_); } - const std::string& getFileAAD() const { return file_AAD_; } + const std::string& file_aad() const { return file_aad_; } - std::shared_ptr getColumnProperties( + std::shared_ptr column_properties( const std::shared_ptr& column_path); private: @@ -477,11 +477,11 @@ class PARQUET_EXPORT FileEncryptionProperties { std::string footer_key_; std::string footer_key_metadata_; bool encrypted_footer_; - std::string file_AAD_; + std::string file_aad_; std::map, std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_property_map_; + schema::ColumnPath::CmpColumnPath> column_properties_; FileEncryptionProperties(ParquetCipher::type cipher, std::string footer_key, @@ -492,7 +492,7 @@ class PARQUET_EXPORT FileEncryptionProperties { const std::map, std::shared_ptr, schema::ColumnPath::CmpColumnPath>& - column_property_map); + column_properties); }; } // namespace parquet diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index f1d3bb72c9ad..b7148c61d9ae 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -278,7 +278,7 @@ class SerializedFile : public ParquetFileReader::Contents { } file_decryptor_.reset(new InternalFileDecryptor(file_decryption_properties)); - std::string aad_prefix = file_decryption_properties->getAADPrefix(); + std::string aad_prefix = file_decryption_properties->aad_prefix(); EncryptionAlgorithm algo = file_metadata_->encryption_algorithm(); if (!algo.aad.aad_prefix.empty()) { @@ -290,7 +290,7 @@ class SerializedFile : public ParquetFileReader::Contents { } aad_prefix = algo.aad.aad_prefix; std::shared_ptr aad_prefix_verifier = - file_decryption_properties->getAADPrefixVerifier(); + file_decryption_properties->aad_prefix_verifier(); if (aad_prefix_verifier != NULLPTR) aad_prefix_verifier->check(aad_prefix); } @@ -299,12 +299,12 @@ class SerializedFile : public ParquetFileReader::Contents { "but not stored in file and not supplied " "in decryption 
properties"); } - std::string fileAAD = aad_prefix + algo.aad.aad_file_unique; + std::string file_aad = aad_prefix + algo.aad.aad_file_unique; - file_decryptor_->file_aad(fileAAD); + file_decryptor_->file_aad(file_aad); file_decryptor_->algorithm(algo.algorithm); file_decryptor_->footer_key_metadata(file_metadata_->footer_signing_key_metadata()); - if (file_decryption_properties->checkFooterIntegrity()) { + if (file_decryption_properties->check_plaintext_footer_integrity()) { if (metadata_len - read_metadata_len != 28) { throw ParquetException("Invalid parquet file. Cannot verify plaintext" "mode footer."); @@ -356,7 +356,7 @@ class SerializedFile : public ParquetFileReader::Contents { FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); EncryptionAlgorithm algo = file_crypto_metadata_->encryption_algorithm(); - std::string aad_prefix = file_decryption_properties->getAADPrefix(); + std::string aad_prefix = file_decryption_properties->aad_prefix(); if (!algo.aad.aad_prefix.empty()) { if (!aad_prefix.empty()) { @@ -367,7 +367,7 @@ class SerializedFile : public ParquetFileReader::Contents { } aad_prefix = algo.aad.aad_prefix; std::shared_ptr aad_prefix_verifier = - file_decryption_properties->getAADPrefixVerifier(); + file_decryption_properties->aad_prefix_verifier(); if (aad_prefix_verifier != NULLPTR) aad_prefix_verifier->check(aad_prefix); } diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 9785bb250c3a..6904d4105641 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -297,9 +297,9 @@ class FileSerializer : public ParquetFileWriter::Contents { auto file_encryption = properties_->file_encryption(); if (file_encryption == nullptr) { file_metadata_ = metadata_->Finish(); - WriteFileMetaData(*metadata, sink_.get()); + WriteFileMetaData(*file_metadata_, sink_.get()); } else { - if (file_encryption->encryptedFooter()) { + if (file_encryption->encrypted_footer()) { // encrypted 
footer file_metadata_ = metadata_->Finish(); @@ -315,7 +315,7 @@ class FileSerializer : public ParquetFileWriter::Contents { } else { // footer plain mode EncryptionAlgorithm signing_encryption; - EncryptionAlgorithm algo = file_encryption->getAlgorithm(); + EncryptionAlgorithm algo = file_encryption->algorithm(); signing_encryption.aad.aad_file_unique = algo.aad.aad_file_unique; signing_encryption.aad.supply_aad_prefix = algo.aad.supply_aad_prefix; if (!algo.aad.supply_aad_prefix) @@ -323,7 +323,7 @@ class FileSerializer : public ParquetFileWriter::Contents { signing_encryption.algorithm = ParquetCipher::AES_GCM_V1; file_metadata_ = metadata_->Finish( &signing_encryption, - file_encryption->getFooterSigningKeyMetadata ()); + file_encryption->footer_signing_key_metadata ()); auto footer_signing_encryptor = file_encryptor_->GetFooterSigningEncryptor(); WriteFileMetaData(*file_metadata_, sink_.get(), footer_signing_encryptor, false); } @@ -401,7 +401,7 @@ class FileSerializer : public ParquetFileWriter::Contents { PARQUET_THROW_NOT_OK(sink_->Write(PARQUET_MAGIC, 4)); } else { file_encryptor_.reset(new InternalFileEncryptor(file_encryption)); - if (file_encryption->encryptedFooter()) { + if (file_encryption->encrypted_footer()) { PARQUET_THROW_NOT_OK(sink_->Write(PARQUET_EMAGIC, 4)); } else { diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index 9f71ac03de67..5082cbe8839b 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -59,16 +59,16 @@ std::shared_ptr InternalFileDecryptor::GetFooterSigningE { if (footer_signing_encryptor_ != NULLPTR) return footer_signing_encryptor_; - std::string footer_key = properties_->getFooterKey(); + std::string footer_key = properties_->footer_key(); // ignore footer key metadata if footer key is explicitly set via API if (footer_key.empty()) { if (footer_key_metadata_.empty()) throw ParquetException("No footer key or key 
metadata"); - if (properties_->getKeyRetriever() == nullptr) + if (properties_->key_retriever() == nullptr) throw ParquetException("No footer key or key retriever"); try { footer_key = - properties_->getKeyRetriever()->getKey(footer_key_metadata_); + properties_->key_retriever()->GetKey(footer_key_metadata_); } catch (KeyAccessDeniedException &e) { std::stringstream ss; ss << "Footer key: access denied " << e.what() << "\n"; @@ -107,14 +107,14 @@ std::shared_ptr InternalFileDecryptor::GetFooterDecryptor( const std::string& aad, bool metadata) { if (footer_decryptor_ != NULLPTR) return footer_decryptor_; - std::string footer_key = properties_->getFooterKey(); + std::string footer_key = properties_->footer_key(); if (footer_key.empty()) { if (footer_key_metadata_.empty()) throw ParquetException("No footer key or key metadata"); - if (properties_->getKeyRetriever() == nullptr) + if (properties_->key_retriever() == nullptr) throw ParquetException("No footer key or key retriever"); try { - footer_key = properties_->getKeyRetriever()->getKey(footer_key_metadata_); + footer_key = properties_->key_retriever()->GetKey(footer_key_metadata_); } catch (KeyAccessDeniedException &e) { std::stringstream ss; ss << "Footer key: access denied " << e.what() << "\n";; @@ -157,12 +157,12 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( && column_map_->find(column_path) != column_map_->end()) { column_key = column_map_->at(column_path); } else { - column_key = properties_->getColumnKey(column_path); + column_key = properties_->column_key(column_path); // No explicit column key given via API. Retrieve via key metadata. 
if (column_key.empty() && !column_key_metadata.empty() && - properties_->getKeyRetriever() != nullptr){ + properties_->key_retriever() != nullptr){ try { - column_key = properties_->getKeyRetriever()->getKey(column_key_metadata); + column_key = properties_->key_retriever()->GetKey(column_key_metadata); } catch (KeyAccessDeniedException &e) { std::stringstream ss; ss << "HiddenColumnException, path=" + diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h index 9646ebbb652a..41740dc182f7 100644 --- a/cpp/src/parquet/internal_file_decryptor.h +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -36,7 +36,7 @@ class Decryptor { const std::string& key, const std::string& file_aad, const std::string& aad); - const std::string& fileAAD() const { return file_aad_; } + const std::string& file_aad() const { return file_aad_; } void aad(const std::string& aad) { aad_ = aad; } int CiphertextSizeDelta(); diff --git a/cpp/src/parquet/internal_file_encryptor.cc b/cpp/src/parquet/internal_file_encryptor.cc index 7f2195cc20bb..fcd224339a8d 100644 --- a/cpp/src/parquet/internal_file_encryptor.cc +++ b/cpp/src/parquet/internal_file_encryptor.cc @@ -33,23 +33,23 @@ InternalFileEncryptor::InternalFileEncryptor(FileEncryptionProperties* propertie : properties_(properties) {} std::shared_ptr InternalFileEncryptor::GetFooterEncryptor() { - ParquetCipher::type algorithm = properties_->getAlgorithm().algorithm; - std::string aad = parquet_encryption::createFooterAAD(properties_->getFileAAD()); - std::string footer_key = properties_->getFooterEncryptionKey(); + ParquetCipher::type algorithm = properties_->algorithm().algorithm; + std::string aad = parquet_encryption::createFooterAAD(properties_->file_aad()); + std::string footer_key = properties_->footer_encryption_key(); auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_key.size()); return std::make_shared(aes_encryptor, footer_key, - properties_->getFileAAD(), aad); + 
properties_->file_aad(), aad); } std::shared_ptr InternalFileEncryptor::GetFooterSigningEncryptor() { - ParquetCipher::type algorithm = properties_->getAlgorithm().algorithm; - std::string aad = parquet_encryption::createFooterAAD(properties_->getFileAAD()); - std::string footer_signing_key = properties_->getFooterSigningKey(); + ParquetCipher::type algorithm = properties_->algorithm().algorithm; + std::string aad = parquet_encryption::createFooterAAD(properties_->file_aad()); + std::string footer_signing_key = properties_->footer_signing_key(); auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_signing_key.size()); return std::make_shared(aes_encryptor, footer_signing_key, - properties_->getFileAAD(), aad); + properties_->file_aad(), aad); } std::shared_ptr InternalFileEncryptor::GetColumnMetaEncryptor( @@ -65,29 +65,29 @@ std::shared_ptr InternalFileEncryptor::GetColumnDataEncryptor( std::shared_ptr InternalFileEncryptor::InternalFileEncryptor::GetColumnEncryptor( const std::shared_ptr& column_path, bool metadata) { - auto column_prop = properties_->getColumnProperties(column_path); + auto column_prop = properties_->column_properties(column_path); if (column_prop == NULLPTR) { return NULLPTR; } std::string key; - if (column_prop->isEncryptedWithFooterKey()) { - if (properties_->encryptedFooter()) { - key = properties_->getFooterEncryptionKey(); + if (column_prop->is_encrypted_with_footer_key()) { + if (properties_->encrypted_footer()) { + key = properties_->footer_encryption_key(); } else { - key = properties_->getFooterSigningKey(); + key = properties_->footer_signing_key(); } } else { - key = column_prop->getKey(); + key = column_prop->key(); } - ParquetCipher::type algorithm = properties_->getAlgorithm().algorithm; + ParquetCipher::type algorithm = properties_->algorithm().algorithm; auto aes_encryptor = metadata ? 
GetMetaAesEncryptor(algorithm, key.size()) : GetDataAesEncryptor(algorithm, key.size()); - std::string file_aad = properties_->getFileAAD(); + std::string file_aad = properties_->file_aad(); // TODO: aad return std::make_shared(aes_encryptor, key, file_aad, ""); diff --git a/cpp/src/parquet/internal_file_encryptor.h b/cpp/src/parquet/internal_file_encryptor.h index 2e3a3df04081..75c167b5339e 100644 --- a/cpp/src/parquet/internal_file_encryptor.h +++ b/cpp/src/parquet/internal_file_encryptor.h @@ -14,7 +14,7 @@ class Encryptor { Encryptor(parquet_encryption::AesEncryptor* aes_encryptor, const std::string& key, const std::string& file_aad, const std::string& aad); - const std::string& fileAAD() { return file_aad_; } + const std::string& file_aad() { return file_aad_; } void aad(const std::string& aad) { aad_ = aad; } int CiphertextSizeDelta(); diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index a5d2a5e1daf9..834ebf0b345c 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -944,7 +944,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { const auto& encrypt_md = properties_->column_encryption_props(column_->path()); // column is unencrypted - if (!encrypt_md || !encrypt_md->isEncrypted()) { + if (!encrypt_md || !encrypt_md->is_encrypted()) { column_chunk_->__isset.meta_data = true; column_chunk_->__set_meta_data(column_metadata_); @@ -954,12 +954,12 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { // encrypted with footer key format::ColumnCryptoMetaData ccmd; - if (encrypt_md->isEncryptedWithFooterKey()) { + if (encrypt_md->is_encrypted_with_footer_key()) { ccmd.__isset.ENCRYPTION_WITH_FOOTER_KEY = true; ccmd.__set_ENCRYPTION_WITH_FOOTER_KEY(format::EncryptionWithFooterKey()); } else { // encrypted with column key format::EncryptionWithColumnKey eck; - eck.__set_key_metadata(encrypt_md->getKeyMetaData()); + eck.__set_key_metadata(encrypt_md->key_metadata()); 
eck.__set_path_in_schema(column_->path()->ToDotVector()); ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY = true; ccmd.__set_ENCRYPTION_WITH_COLUMN_KEY(eck); @@ -967,12 +967,12 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { column_chunk_->__set_crypto_metadata(ccmd); // TODO: check file_encryption() is null or not - auto footer_key = properties_->file_encryption()->getFooterEncryptionKey(); + auto footer_key = properties_->file_encryption()->footer_encryption_key(); // non-uniform: footer is unencrypted, or column is encrypted with a column-specific // key - if ((footer_key.empty() && encrypt_md->isEncrypted()) || - !encrypt_md->isEncryptedWithFooterKey()) { + if ((footer_key.empty() && encrypt_md->is_encrypted()) || + !encrypt_md->is_encrypted_with_footer_key()) { // Thrift-serialize the ColumnMetaData structure, // encrypt it with the column key, and write to encrypted_column_metadata uint8_t* serialized_data; @@ -1220,7 +1220,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { : properties_(props), schema_(schema), key_value_metadata_(key_value_metadata) { metadata_.reset(new format::FileMetaData()); if (props->file_encryption() != nullptr - && props->file_encryption()->getFooterSigningKey() == NULL_STRING) { + && props->file_encryption()->footer_signing_key() == NULL_STRING) { crypto_metadata_.reset(new format::FileCryptoMetaData()); } } @@ -1303,12 +1303,12 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { auto file_encryption = properties_->file_encryption(); - crypto_metadata_->__set_encryption_algorithm(ToThrift(file_encryption->getAlgorithm())); + crypto_metadata_->__set_encryption_algorithm(ToThrift(file_encryption->algorithm())); std::string key_metadata; - if (file_encryption->encryptedFooter()) - key_metadata = file_encryption->getFooterEncryptionKeyMetadata(); + if (file_encryption->encrypted_footer()) + key_metadata = file_encryption->footer_encryption_key_metadata(); else - key_metadata = 
file_encryption->getFooterSigningKeyMetadata(); + key_metadata = file_encryption->footer_signing_key_metadata(); if (!key_metadata.empty()) { crypto_metadata_->__set_key_metadata(key_metadata); diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 48e6098849c2..5e5f6cdd2e03 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -374,7 +374,7 @@ class PARQUET_EXPORT WriterProperties { inline std::string created_by() const { return parquet_created_by_; } inline FileEncryptionProperties* file_encryption() const { - return parquet_file_encryption_.get(); + return file_encryption_.get(); } inline Encoding::type dictionary_index_encoding() const { @@ -422,8 +422,8 @@ class PARQUET_EXPORT WriterProperties { std::shared_ptr column_encryption_props(const std::shared_ptr& path) const { - if (parquet_file_encryption_) { - return parquet_file_encryption_->getColumnProperties(path); + if (file_encryption_) { + return file_encryption_->column_properties(path); } else { return NULLPTR; } @@ -444,7 +444,7 @@ class PARQUET_EXPORT WriterProperties { pagesize_(pagesize), parquet_version_(version), parquet_created_by_(created_by), - parquet_file_encryption_(file_encryption), + file_encryption_(file_encryption), default_column_properties_(default_column_properties), column_properties_(column_properties) {} @@ -455,7 +455,7 @@ class PARQUET_EXPORT WriterProperties { int64_t pagesize_; ParquetVersion::type parquet_version_; std::string parquet_created_by_; - std::shared_ptr parquet_file_encryption_; + std::shared_ptr file_encryption_; ColumnProperties default_column_properties_; std::unordered_map column_properties_; }; From af3aeef17f5d760353a1393d2680cf6f313a57a0 Mon Sep 17 00:00:00 2001 From: Ha Thi Tham Date: Thu, 9 May 2019 14:15:11 +0700 Subject: [PATCH 048/201] fix const& --- cpp/src/parquet/encryption_properties.cc | 14 ++++++------ cpp/src/parquet/encryption_properties.h | 28 ++++++++++++------------ 2 files changed, 21 
insertions(+), 21 deletions(-) diff --git a/cpp/src/parquet/encryption_properties.cc b/cpp/src/parquet/encryption_properties.cc index 36cccdb672d2..258fe642efba 100644 --- a/cpp/src/parquet/encryption_properties.cc +++ b/cpp/src/parquet/encryption_properties.cc @@ -24,7 +24,7 @@ namespace parquet { ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key_id( - std::string key_id) { + const std::string& key_id) { //key_id is expected to be in UTF8 encoding ::arrow::util::InitializeUTF8(); const uint8_t *data = reinterpret_cast(key_id.c_str()); @@ -88,11 +88,11 @@ FileDecryptionProperties::FileDecryptionProperties( const std::string& footer_key, const std::shared_ptr& key_retriever, bool check_plaintext_footer_integrity, - std::string aad_prefix, + const std::string& aad_prefix, std::shared_ptr aad_prefix_verifier, - std::map, + const std::map, std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_properties) { + schema::ColumnPath::CmpColumnPath>& column_properties) { DCHECK(!footer_key.empty() || NULLPTR != key_retriever || 0 != column_properties.size()); @@ -113,7 +113,7 @@ FileDecryptionProperties::FileDecryptionProperties( } FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key_id( - std::string key_id) { + const std::string& key_id) { //key_id is expected to be in UTF8 encoding ::arrow::util::InitializeUTF8(); const uint8_t* data = reinterpret_cast(key_id.c_str()); @@ -143,8 +143,8 @@ std::shared_ptr FileEncryptionProperties::column_pro } FileEncryptionProperties::FileEncryptionProperties(ParquetCipher::type cipher, - std::string footer_key, - std::string footer_key_metadata, + const std::string& footer_key, + const std::string& footer_key_metadata, bool encrypted_footer, const std::string& aad_prefix, bool store_aad_prefix_in_file, diff --git a/cpp/src/parquet/encryption_properties.h b/cpp/src/parquet/encryption_properties.h index 738a762d0c69..e1b4221ae68d 100644 --- 
a/cpp/src/parquet/encryption_properties.h +++ b/cpp/src/parquet/encryption_properties.h @@ -44,7 +44,7 @@ class PARQUET_EXPORT ColumnEncryptionProperties { class Builder { public: // Convenience builder for regular (not nested) columns. - Builder(const std::string name) { + Builder(const std::string& name) { Builder(schema::ColumnPath::FromDotString(name), true); } @@ -66,7 +66,7 @@ class PARQUET_EXPORT ColumnEncryptionProperties { } // Set a key retrieval metadata. - // use either withKeyMetaData or withKeyID, not both + // use either key_metadata() or key_id(), not both Builder* key_metadata(const std::string& key_metadata) { DCHECK(!key_metadata.empty()); DCHECK(key_metadata_.empty()); @@ -75,9 +75,9 @@ class PARQUET_EXPORT ColumnEncryptionProperties { } // Set a key retrieval metadata (converted from String). - // use either withKeyMetaData or withKeyID, not both + // use either key_metadata() or key_id(), not both // key_id will be converted to metadata (UTF-8 array). - Builder* key_id(std::string key_id); + Builder* key_id(const std::string& key_id); std::shared_ptr build() { return std::shared_ptr( @@ -125,7 +125,7 @@ class PARQUET_EXPORT ColumnDecryptionProperties { class Builder { public: // convenience builder for regular (not nested) columns. - Builder(const std::string name) { + Builder(const std::string& name) { Builder(schema::ColumnPath::FromDotString(name)); } @@ -181,7 +181,7 @@ class PARQUET_EXPORT AADPrefixVerifier { // Throws exception if an AAD prefix is wrong. // In a data set, AAD Prefixes should be collected, // and then checked for missing files. - virtual void check(std::string aad_prefix) = 0; + virtual void check(const std::string& aad_prefix) = 0; }; class PARQUET_EXPORT FileDecryptionProperties { @@ -198,7 +198,7 @@ class PARQUET_EXPORT FileDecryptionProperties { // If explicit key is not set, footer key will be fetched from // key retriever. // param footerKey Key length must be either 16, 24 or 32 bytes. 
- Builder* footer_key(std::string footer_key) { + Builder* footer_key(const std::string& footer_key) { if (footer_key.empty ()) { return this; } @@ -258,7 +258,7 @@ class PARQUET_EXPORT FileDecryptionProperties { // A must when a prefix is used for file encryption, but not stored in file. // If AAD prefix is stored in file, it will be compared to the explicitly // supplied value and an exception will be thrown if they differ. - Builder* aad_prefix(std::string aad_prefix) { + Builder* aad_prefix(const std::string& aad_prefix) { if (aad_prefix.empty()) { return this; } @@ -337,11 +337,11 @@ class PARQUET_EXPORT FileDecryptionProperties { const std::string& footer_key, const std::shared_ptr& key_retriever, bool check_plaintext_footer_integrity, - std::string aad_prefix, + const std::string& aad_prefix, std::shared_ptr aad_prefix_verifier, - std::map, + const std::map, std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_properties); + schema::ColumnPath::CmpColumnPath>& column_properties); }; class PARQUET_EXPORT FileEncryptionProperties { @@ -371,7 +371,7 @@ class PARQUET_EXPORT FileEncryptionProperties { // Set a key retrieval metadata (converted from String). // use either footer_key_metadata or footer_key_id, not both. - Builder* footer_key_id(std::string key_id); + Builder* footer_key_id(const std::string& key_id); // Set a key retrieval metadata. // use either footer_key_metadata or footer_key_id, not both. 
@@ -484,8 +484,8 @@ class PARQUET_EXPORT FileEncryptionProperties { schema::ColumnPath::CmpColumnPath> column_properties_; FileEncryptionProperties(ParquetCipher::type cipher, - std::string footer_key, - std::string footer_key_metadata, + const std::string& footer_key, + const std::string& footer_key_metadata, bool encrypted_footer, const std::string& aad_prefix, bool store_aad_prefix_in_file, From 1d51d12cc45479edef47c4daa797b813d76819d5 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Fri, 10 May 2019 19:00:35 +0700 Subject: [PATCH 049/201] make format --- cpp/src/parquet/column_reader.cc | 66 +++---- cpp/src/parquet/column_writer.cc | 69 ++++--- cpp/src/parquet/column_writer_test.cc | 10 +- cpp/src/parquet/encryption.cc | 3 +- cpp/src/parquet/encryption.h | 12 +- cpp/src/parquet/encryption_properties.cc | 70 ++++---- cpp/src/parquet/encryption_properties.h | 199 +++++++++------------ cpp/src/parquet/file_reader.cc | 112 ++++++------ cpp/src/parquet/file_writer.cc | 67 ++++--- cpp/src/parquet/internal_file_decryptor.cc | 164 +++++++++-------- cpp/src/parquet/internal_file_decryptor.h | 63 ++++--- cpp/src/parquet/internal_file_encryptor.cc | 94 +++++----- cpp/src/parquet/internal_file_encryptor.h | 42 +++-- cpp/src/parquet/metadata.cc | 186 +++++++++---------- cpp/src/parquet/metadata.h | 32 ++-- cpp/src/parquet/parquet.pc | 30 ++++ cpp/src/parquet/parquet_version.h | 24 +++ cpp/src/parquet/properties.h | 9 +- cpp/src/parquet/thrift.h | 32 ++-- 19 files changed, 651 insertions(+), 633 deletions(-) create mode 100644 cpp/src/parquet/parquet.pc create mode 100644 cpp/src/parquet/parquet_version.h diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 2bddc3b971b2..60912d55981c 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -23,8 +23,10 @@ #include #include #include +#include #include #include +#include #include "arrow/array.h" #include "arrow/builder.h" @@ -126,7 +128,7 @@ class 
SerializedPageReader : public PageReader { : stream_(stream), decompression_buffer_(AllocateBuffer(pool, 0)), first_page_(true), - column_has_dictionary_ (column_has_dictionary), + column_has_dictionary_(column_has_dictionary), row_group_ordinal_(row_group_ordinal), column_ordinal_(column_ordinal), page_ordinal_(-1), @@ -138,21 +140,17 @@ class SerializedPageReader : public PageReader { max_page_header_size_ = kDefaultMaxPageHeaderSize; decompressor_ = GetCodecFromArrow(codec); if (data_decryptor_ != NULLPTR) { - DCHECK (!data_decryptor_->file_aad().empty()); - //prepare the AAD for quick update later + DCHECK(!data_decryptor_->file_aad().empty()); + // prepare the AAD for quick update later data_pageAAD_ = parquet_encryption::createModuleAAD( - data_decryptor_->file_aad(), - parquet_encryption::DataPage, - row_group_ordinal_, + data_decryptor_->file_aad(), parquet_encryption::DataPage, row_group_ordinal_, column_ordinal_, (int16_t)-1); } if (meta_decryptor_ != NULLPTR) { - DCHECK (!meta_decryptor_->file_aad().empty()); + DCHECK(!meta_decryptor_->file_aad().empty()); data_page_headerAAD_ = parquet_encryption::createModuleAAD( - meta_decryptor_->file_aad(), - parquet_encryption::DataPageHeader, - row_group_ordinal_, - column_ordinal_, (int16_t)-1); + meta_decryptor_->file_aad(), parquet_encryption::DataPageHeader, + row_group_ordinal_, column_ordinal_, (int16_t)-1); } } @@ -198,15 +196,16 @@ std::shared_ptr SerializedPageReader::NextPage() { // Loop here because there may be unhandled page types that we skip until // finding a page that we do know what to do with bool current_page_is_dictionary = false; - if (column_has_dictionary_ ){ + if (column_has_dictionary_) { if (first_page_) { current_page_is_dictionary = true; first_page_ = false; - } else + } else { page_ordinal_++; - } else + } + } else { page_ordinal_++; - + } while (seen_num_rows_ < total_num_rows_) { uint32_t header_size = 0; @@ -229,14 +228,11 @@ std::shared_ptr SerializedPageReader::NextPage() { if 
(meta_decryptor_ != NULLPTR) { if (current_page_is_dictionary) { aad = parquet_encryption::createModuleAAD( - meta_decryptor_->file_aad(), - parquet_encryption::DictionaryPageHeader, - row_group_ordinal_, - column_ordinal_, (int16_t)-1); + meta_decryptor_->file_aad(), parquet_encryption::DictionaryPageHeader, + row_group_ordinal_, column_ordinal_, (int16_t)-1); meta_decryptor_->aad(aad); } else { - parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, - page_ordinal_); + parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, page_ordinal_); meta_decryptor_->aad(data_page_headerAAD_); } } @@ -260,14 +256,12 @@ std::shared_ptr SerializedPageReader::NextPage() { int compressed_len = current_page_header_.compressed_page_size; int uncompressed_len = current_page_header_.uncompressed_page_size; - if (data_decryptor_ != NULLPTR){ + if (data_decryptor_ != NULLPTR) { DCHECK(!data_decryptor_->file_aad().empty()); - if (current_page_is_dictionary){ + if (current_page_is_dictionary) { aad = parquet_encryption::createModuleAAD( - data_decryptor_->file_aad(), - parquet_encryption::DictionaryPage, - row_group_ordinal_, - column_ordinal_, (int16_t)-1); + data_decryptor_->file_aad(), parquet_encryption::DictionaryPage, + row_group_ordinal_, column_ordinal_, (int16_t)-1); data_decryptor_->aad(aad); } else { parquet_encryption::quickUpdatePageAAD(data_pageAAD_, page_ordinal_); @@ -288,8 +282,8 @@ std::shared_ptr SerializedPageReader::NextPage() { // Decrypt it if we need to if (data_decryptor_ != nullptr) { decryption_buffer_->Resize(compressed_len - data_decryptor_->CiphertextSizeDelta()); - compressed_len = data_decryptor_->Decrypt( - buffer, compressed_len, decryption_buffer_->mutable_data()); + compressed_len = data_decryptor_->Decrypt(buffer, compressed_len, + decryption_buffer_->mutable_data()); buffer = decryption_buffer_->data(); } @@ -360,15 +354,11 @@ std::shared_ptr SerializedPageReader::NextPage() { return std::shared_ptr(nullptr); } -std::unique_ptr 
PageReader::Open(const std::shared_ptr& stream, - int64_t total_num_rows, - Compression::type codec, - bool column_has_dictionary, - int16_t row_group_ordinal, - int16_t column_ordinal, - ::arrow::MemoryPool* pool, - std::shared_ptr meta_decryptor, - std::shared_ptr data_decryptor) { +std::unique_ptr PageReader::Open( + const std::shared_ptr& stream, int64_t total_num_rows, + Compression::type codec, bool column_has_dictionary, int16_t row_group_ordinal, + int16_t column_ordinal, ::arrow::MemoryPool* pool, + std::shared_ptr meta_decryptor, std::shared_ptr data_decryptor) { return std::unique_ptr( new SerializedPageReader(stream, total_num_rows, codec, column_has_dictionary, row_group_ordinal, column_ordinal, pool, meta_decryptor, data_decryptor)); diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 3ea5f7124cac..ee8a95537f81 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -37,10 +38,10 @@ #include "parquet/column_page.h" #include "parquet/encoding.h" +#include "parquet/internal_file_encryptor.h" #include "parquet/metadata.h" #include "parquet/platform.h" #include "parquet/properties.h" -#include "parquet/internal_file_encryptor.h" #include "parquet/schema.h" #include "parquet/statistics.h" #include "parquet/thrift.h" @@ -137,8 +138,7 @@ int LevelEncoder::Encode(int batch_size, const int16_t* levels) { // and the page metadata. 
class SerializedPageWriter : public PageWriter { public: - SerializedPageWriter(const std::shared_ptr& sink, - Compression::type codec, + SerializedPageWriter(const std::shared_ptr& sink, Compression::type codec, ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, int16_t column_chunk_ordinal, MemoryPool* pool = arrow::default_memory_pool(), @@ -156,21 +156,17 @@ class SerializedPageWriter : public PageWriter { row_group_ordinal_(row_group_ordinal), column_ordinal_(column_chunk_ordinal), meta_encryptor_(meta_encryptor), - data_encryptor_(data_encryptor){ + data_encryptor_(data_encryptor) { if (data_encryptor_ != NULLPTR) { - //prepare the add for quick update later + // prepare the add for quick update later data_pageAAD_ = parquet_encryption::createModuleAAD( - data_encryptor_->file_aad(), - parquet_encryption::DataPage, - row_group_ordinal_, + data_encryptor_->file_aad(), parquet_encryption::DataPage, row_group_ordinal_, column_ordinal_, (int16_t)-1); } if (meta_encryptor_ != NULLPTR) { data_page_headerAAD_ = parquet_encryption::createModuleAAD( - meta_encryptor_->file_aad(), - parquet_encryption::DataPageHeader, - row_group_ordinal_, - column_ordinal_, (int16_t)-1); + meta_encryptor_->file_aad(), parquet_encryption::DataPageHeader, + row_group_ordinal_, column_ordinal_, (int16_t)-1); } compressor_ = GetCodecFromArrow(codec); thrift_serializer_.reset(new ThriftSerializer); @@ -224,13 +220,12 @@ class SerializedPageWriter : public PageWriter { } if (meta_encryptor_) { - meta_encryptor_->aad( - parquet_encryption::createModuleAAD(meta_encryptor_->file_aad(), - parquet_encryption::DictionaryPageHeader, - row_group_ordinal_, - column_ordinal_, (int16_t)-1)); + meta_encryptor_->aad(parquet_encryption::createModuleAAD( + meta_encryptor_->file_aad(), parquet_encryption::DictionaryPageHeader, + row_group_ordinal_, column_ordinal_, (int16_t)-1)); } - int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_); + int64_t 
header_size = + thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_); PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len)); total_uncompressed_size_ += uncompressed_size + header_size; @@ -246,12 +241,10 @@ class SerializedPageWriter : public PageWriter { metadata_->Finish(num_values_, dictionary_page_offset_, -1, data_page_offset_, total_compressed_size_, total_uncompressed_size_, has_dictionary, fallback); - if (meta_encryptor_ != nullptr){ - meta_encryptor_->aad( - parquet_encryption::createModuleAAD(meta_encryptor_->file_aad(), - parquet_encryption::ColumnMetaData, - row_group_ordinal_, - column_ordinal_, (int16_t)-1)); + if (meta_encryptor_ != nullptr) { + meta_encryptor_->aad(parquet_encryption::createModuleAAD( + meta_encryptor_->file_aad(), parquet_encryption::ColumnMetaData, + row_group_ordinal_, column_ordinal_, (int16_t)-1)); } // Write metadata at end of column chunk metadata_->WriteTo(sink_.get(), meta_encryptor_); @@ -297,10 +290,10 @@ class SerializedPageWriter : public PageWriter { if (data_encryptor_.get()) { parquet_encryption::quickUpdatePageAAD(data_pageAAD_, page_ordinal_); data_encryptor_->aad(data_pageAAD_); - encrypted_data_buffer->Resize(data_encryptor_->CiphertextSizeDelta() + output_data_len); - output_data_len = data_encryptor_->Encrypt( - compressed_data->data(), output_data_len, - encrypted_data_buffer->mutable_data()); + encrypted_data_buffer->Resize(data_encryptor_->CiphertextSizeDelta() + + output_data_len); + output_data_len = data_encryptor_->Encrypt(compressed_data->data(), output_data_len, + encrypted_data_buffer->mutable_data()); output_data_buffer = encrypted_data_buffer->data(); } @@ -318,11 +311,11 @@ class SerializedPageWriter : public PageWriter { } if (meta_encryptor_) { - parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, - page_ordinal_); + parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, page_ordinal_); meta_encryptor_->aad(data_page_headerAAD_); } - int64_t 
header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_); + int64_t header_size = + thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_); PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len)); total_uncompressed_size_ += uncompressed_size + header_size; @@ -436,15 +429,13 @@ std::unique_ptr PageWriter::Open( std::shared_ptr meta_encryptor, std::shared_ptr data_encryptor) { if (buffered_row_group) { - return std::unique_ptr( - new BufferedPageWriter(sink, codec, metadata, - row_group_ordinal, column_chunk_ordinal, - pool, meta_encryptor, data_encryptor)); + return std::unique_ptr(new BufferedPageWriter( + sink, codec, metadata, row_group_ordinal, column_chunk_ordinal, pool, + meta_encryptor, data_encryptor)); } else { - return std::unique_ptr( - new SerializedPageWriter(sink, codec, metadata, - row_group_ordinal, column_chunk_ordinal, - pool, meta_encryptor, data_encryptor)); + return std::unique_ptr(new SerializedPageWriter( + sink, codec, metadata, row_group_ordinal, column_chunk_ordinal, pool, + meta_encryptor, data_encryptor)); } } diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index 58c538b9939b..c72efe47f90e 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -108,7 +108,7 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { metadata_ = ColumnChunkMetaDataBuilder::Make(writer_properties_, this->descr_); std::unique_ptr pager = - PageWriter::Open(sink_, column_properties.compression(), nullptr, metadata_.get()); + PageWriter::Open(sink_, column_properties.compression(), metadata_.get()); std::shared_ptr writer = ColumnWriter::Make(metadata_.get(), std::move(pager), writer_properties_.get()); return std::static_pointer_cast>(writer); @@ -245,8 +245,8 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { // This is because the ColumnChunkMetaData semantics dictate the metadata object is // 
complete (no changes to the metadata buffer can be made after instantiation) ApplicationVersion app_version(this->writer_properties_->created_by()); - auto metadata_accessor = - ColumnChunkMetaData::Make(metadata_->contents(), this->descr_, -1, -1, &app_version); + auto metadata_accessor = ColumnChunkMetaData::Make( + metadata_->contents(), this->descr_, -1, -1, &app_version); return metadata_accessor->is_stats_set(); } @@ -255,8 +255,8 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { // This is because the ColumnChunkMetaData semantics dictate the metadata object is // complete (no changes to the metadata buffer can be made after instantiation) ApplicationVersion app_version(this->writer_properties_->created_by()); - auto metadata_accessor = - ColumnChunkMetaData::Make(metadata_->contents(), this->descr_, &app_version); + auto metadata_accessor = ColumnChunkMetaData::Make( + metadata_->contents(), this->descr_, -1, -1, &app_version); auto encoded_stats = metadata_accessor->statistics()->Encode(); return {encoded_stats.has_min, encoded_stats.has_max}; } diff --git a/cpp/src/parquet/encryption.cc b/cpp/src/parquet/encryption.cc index 0a2d9ef939a6..cc8501e88915 100644 --- a/cpp/src/parquet/encryption.cc +++ b/cpp/src/parquet/encryption.cc @@ -15,9 +15,10 @@ // specific language governing permissions and limitations // under the License. 
-#include "encryption.h" +#include "parquet/encryption.h" #include +#include namespace parquet { diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 3a4481bd4aad..aff37839c8c4 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -54,20 +54,20 @@ class PARQUET_EXPORT StringKeyIdRetriever : public DecryptionKeyRetriever { class PARQUET_EXPORT HiddenColumnException : public ParquetException { public: - HiddenColumnException(const std::string& columnPath) - : ParquetException(columnPath.c_str()) {} + explicit HiddenColumnException(const std::string& columnPath) + : ParquetException(columnPath.c_str()) {} }; class PARQUET_EXPORT KeyAccessDeniedException : public ParquetException { public: - KeyAccessDeniedException(const std::string& columnPath) - : ParquetException(columnPath.c_str()) {} + explicit KeyAccessDeniedException(const std::string& columnPath) + : ParquetException(columnPath.c_str()) {} }; class PARQUET_EXPORT UnsupportedOperationException : public ParquetException { public: - UnsupportedOperationException(const std::string& columnPath) - : ParquetException(columnPath.c_str()) {} + explicit UnsupportedOperationException(const std::string& columnPath) + : ParquetException(columnPath.c_str()) {} }; } // namespace parquet diff --git a/cpp/src/parquet/encryption_properties.cc b/cpp/src/parquet/encryption_properties.cc index 258fe642efba..f23858194700 100644 --- a/cpp/src/parquet/encryption_properties.cc +++ b/cpp/src/parquet/encryption_properties.cc @@ -18,6 +18,7 @@ #include "parquet/encryption_properties.h" #include +#include #include "arrow/util/utf8.h" @@ -25,9 +26,9 @@ namespace parquet { ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key_id( const std::string& key_id) { - //key_id is expected to be in UTF8 encoding + // key_id is expected to be in UTF8 encoding ::arrow::util::InitializeUTF8(); - const uint8_t *data = reinterpret_cast(key_id.c_str()); + const uint8_t* data = 
reinterpret_cast(key_id.c_str()); if (!::arrow::util::ValidateUTF8(data, key_id.size())) { throw ParquetException("key id should be in UTF8 encoding"); } @@ -38,10 +39,9 @@ ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key_id } ColumnEncryptionProperties::ColumnEncryptionProperties( - bool encrypted, - const std::shared_ptr& column_path, - const std::string& key, - const std::string& key_metadata):column_path_(column_path) { + bool encrypted, const std::shared_ptr& column_path, + const std::string& key, const std::string& key_metadata) + : column_path_(column_path) { DCHECK(column_path != nullptr); if (!encrypted) { DCHECK(key.empty() && key_metadata.empty()); @@ -52,7 +52,7 @@ ColumnEncryptionProperties::ColumnEncryptionProperties( } encrypted_with_footer_key_ = (encrypted && key.empty()); - if (encrypted_with_footer_key_){ + if (encrypted_with_footer_key_) { DCHECK(key_metadata.empty()); } @@ -62,8 +62,8 @@ ColumnEncryptionProperties::ColumnEncryptionProperties( } ColumnDecryptionProperties::ColumnDecryptionProperties( - const std::shared_ptr& column_path, - const std::string& key):column_path_(column_path){ + const std::shared_ptr& column_path, const std::string& key) + : column_path_(column_path) { DCHECK(column_path != nullptr); if (!key.empty()) { @@ -87,14 +87,12 @@ const std::string& FileDecryptionProperties::column_key( FileDecryptionProperties::FileDecryptionProperties( const std::string& footer_key, const std::shared_ptr& key_retriever, - bool check_plaintext_footer_integrity, - const std::string& aad_prefix, + bool check_plaintext_footer_integrity, const std::string& aad_prefix, std::shared_ptr aad_prefix_verifier, const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties) { - DCHECK(!footer_key.empty() || - NULLPTR != key_retriever || + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& column_properties) { + DCHECK(!footer_key.empty() || NULLPTR != key_retriever || 0 != 
column_properties.size()); if (!footer_key.empty()) { @@ -114,7 +112,7 @@ FileDecryptionProperties::FileDecryptionProperties( FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key_id( const std::string& key_id) { - //key_id is expected to be in UTF8 encoding + // key_id is expected to be in UTF8 encoding ::arrow::util::InitializeUTF8(); const uint8_t* data = reinterpret_cast(key_id.c_str()); if (!::arrow::util::ValidateUTF8(data, key_id.size())) { @@ -130,7 +128,7 @@ FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key std::shared_ptr FileEncryptionProperties::column_properties( const std::shared_ptr& column_path) { - if (column_properties_.size () == 0) { + if (column_properties_.size() == 0) { auto builder = std::shared_ptr( new ColumnEncryptionProperties::Builder(column_path)); return builder->build(); @@ -142,38 +140,32 @@ std::shared_ptr FileEncryptionProperties::column_pro return NULLPTR; } -FileEncryptionProperties::FileEncryptionProperties(ParquetCipher::type cipher, - const std::string& footer_key, - const std::string& footer_key_metadata, - bool encrypted_footer, - const std::string& aad_prefix, - bool store_aad_prefix_in_file, - const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& - column_properties) -: footer_key_(footer_key), - footer_key_metadata_(footer_key_metadata), - encrypted_footer_(encrypted_footer), - column_properties_(column_properties) { +FileEncryptionProperties::FileEncryptionProperties( + ParquetCipher::type cipher, const std::string& footer_key, + const std::string& footer_key_metadata, bool encrypted_footer, + const std::string& aad_prefix, bool store_aad_prefix_in_file, + const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& column_properties) + : footer_key_(footer_key), + footer_key_metadata_(footer_key_metadata), + encrypted_footer_(encrypted_footer), + column_properties_(column_properties) { DCHECK(!footer_key.empty()); // footer_key 
must be either 16, 24 or 32 bytes. - DCHECK(footer_key.length() == 16 - || footer_key.length() == 24 - || footer_key.length() == 32); + DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || + footer_key.length() == 32); uint8_t aad_file_unique[AAD_FILE_UNIQUE_LENGTH]; memset(aad_file_unique, 0, AAD_FILE_UNIQUE_LENGTH); RAND_bytes(aad_file_unique, sizeof(AAD_FILE_UNIQUE_LENGTH)); - std::string aad_file_unique_str( - reinterpret_cast(aad_file_unique), - AAD_FILE_UNIQUE_LENGTH) ; + std::string aad_file_unique_str(reinterpret_cast(aad_file_unique), + AAD_FILE_UNIQUE_LENGTH); bool supply_aad_prefix = false; if (aad_prefix.empty()) { file_aad_ = aad_file_unique_str; - } - else { + } else { file_aad_ = aad_prefix + aad_file_unique_str; if (!store_aad_prefix_in_file) supply_aad_prefix = true; } diff --git a/cpp/src/parquet/encryption_properties.h b/cpp/src/parquet/encryption_properties.h index e1b4221ae68d..5848ad3fa685 100644 --- a/cpp/src/parquet/encryption_properties.h +++ b/cpp/src/parquet/encryption_properties.h @@ -18,22 +18,23 @@ #ifndef PARQUET_ENCRYPTION_PROPERTIES_H #define PARQUET_ENCRYPTION_PROPERTIES_H +#include #include #include #include +#include "arrow/util/logging.h" #include "parquet/encryption.h" #include "parquet/exception.h" #include "parquet/schema.h" #include "parquet/types.h" -#include "arrow/util/logging.h" #include "parquet/util/visibility.h" namespace parquet { static const std::string NULL_STRING = ""; -static constexpr ParquetCipher::type DEFAULT_ENCRYPTION_ALGORITHM - = ParquetCipher::AES_GCM_V1; +static constexpr ParquetCipher::type DEFAULT_ENCRYPTION_ALGORITHM = + ParquetCipher::AES_GCM_V1; static constexpr int32_t MAXIMAL_AAD_METADATA_LENGTH = 256; static constexpr bool DEFAULT_ENCRYPTED_FOOTER = true; static constexpr bool DEFAULT_CHECK_SIGNATURE = true; @@ -44,21 +45,20 @@ class PARQUET_EXPORT ColumnEncryptionProperties { class Builder { public: // Convenience builder for regular (not nested) columns. 
- Builder(const std::string& name) { + explicit Builder(const std::string& name) { Builder(schema::ColumnPath::FromDotString(name), true); } // Convenience builder for encrypted columns. - Builder(const std::shared_ptr& path) - : Builder(path, true) {} + explicit Builder(const std::shared_ptr& path) + : Builder(path, true) {} // Set a column-specific key. // If key is not set on an encrypted column, the column will // be encrypted with the footer key. // keyBytes Key length must be either 16, 24 or 32 bytes. Builder* key(const std::string& key) { - if (key.empty ()) - return this; + if (key.empty()) return this; DCHECK(!key.empty()); key_ = key; @@ -81,10 +81,7 @@ class PARQUET_EXPORT ColumnEncryptionProperties { std::shared_ptr build() { return std::shared_ptr( - new ColumnEncryptionProperties(encrypted_, - column_path_, - key_, - key_metadata_)); + new ColumnEncryptionProperties(encrypted_, column_path_, key_, key_metadata_)); } private: @@ -94,7 +91,7 @@ class PARQUET_EXPORT ColumnEncryptionProperties { std::string key_metadata_; Builder(const std::shared_ptr& path, bool encrypted) - : column_path_(path), encrypted_(encrypted) {} + : column_path_(path), encrypted_(encrypted) {} }; const std::shared_ptr& column_path() { return column_path_; } @@ -114,10 +111,8 @@ class PARQUET_EXPORT ColumnEncryptionProperties { std::string key_; std::string key_metadata_; explicit ColumnEncryptionProperties( - bool encrypted, - const std::shared_ptr& column_path, - const std::string& key, - const std::string& key_metadata); + bool encrypted, const std::shared_ptr& column_path, + const std::string& key, const std::string& key_metadata); }; class PARQUET_EXPORT ColumnDecryptionProperties { @@ -125,20 +120,18 @@ class PARQUET_EXPORT ColumnDecryptionProperties { class Builder { public: // convenience builder for regular (not nested) columns. 
- Builder(const std::string& name) { - Builder(schema::ColumnPath::FromDotString(name)); - } + explicit Builder(const std::string& name) + : Builder(schema::ColumnPath::FromDotString(name)) {} - Builder(const std::shared_ptr& path) - : column_path_(path) {} + explicit Builder(const std::shared_ptr& path) + : column_path_(path) {} // Set an explicit column key. If applied on a file that contains // key metadata for this column the metadata will be ignored, // the column will be decrypted with this key. // key length must be either 16, 24 or 32 bytes. Builder* key(const std::string& key) { - if (key.empty ()) - return this; + if (key.empty()) return this; DCHECK(!key.empty()); key_ = key; @@ -150,7 +143,7 @@ class PARQUET_EXPORT ColumnDecryptionProperties { new ColumnDecryptionProperties(column_path_, key_)); } - private: + private: const std::shared_ptr column_path_; std::string key_; }; @@ -170,8 +163,7 @@ class PARQUET_EXPORT ColumnDecryptionProperties { // to override key retriever (or to provide keys when key metadata and/or // key retriever are not available) explicit ColumnDecryptionProperties( - const std::shared_ptr& column_path, - const std::string& key); + const std::shared_ptr& column_path, const std::string& key); }; class PARQUET_EXPORT AADPrefixVerifier { @@ -187,10 +179,8 @@ class PARQUET_EXPORT AADPrefixVerifier { class PARQUET_EXPORT FileDecryptionProperties { public: class Builder { - public: - Builder() { - check_plaintext_footer_integrity_ = DEFAULT_CHECK_SIGNATURE; - } + public: + Builder() { check_plaintext_footer_integrity_ = DEFAULT_CHECK_SIGNATURE; } // Set an explicit footer key. If applied on a file that contains // footer key metadata the metadata will be ignored, the footer @@ -199,7 +189,7 @@ class PARQUET_EXPORT FileDecryptionProperties { // key retriever. // param footerKey Key length must be either 16, 24 or 32 bytes. 
Builder* footer_key(const std::string& footer_key) { - if (footer_key.empty ()) { + if (footer_key.empty()) { return this; } DCHECK(!footer_key.empty()); @@ -213,14 +203,13 @@ class PARQUET_EXPORT FileDecryptionProperties { // invocation of the retriever callback. // If an explicit key is available for a footer or a column, // its key metadata will be ignored. - Builder* column_properties(const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& - column_properties) { - if (column_properties.size () == 0) - return this; + Builder* column_properties( + const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& column_properties) { + if (column_properties.size() == 0) return this; - if (column_properties_.size () != 0) + if (column_properties_.size() != 0) throw ParquetException("Column properties already set"); column_properties_ = column_properties; @@ -233,10 +222,8 @@ class PARQUET_EXPORT FileDecryptionProperties { // invocation of the retriever callback. // If an explicit key is available for a footer or a column, // its key metadata will be ignored. - Builder* key_retriever(const std::shared_ptr& - key_retriever) { - if (key_retriever == NULLPTR) - return this; + Builder* key_retriever(const std::shared_ptr& key_retriever) { + if (key_retriever == NULLPTR) return this; DCHECK(key_retriever_ == NULLPTR); key_retriever_ = key_retriever; @@ -268,10 +255,8 @@ class PARQUET_EXPORT FileDecryptionProperties { } // Set callback for verification of AAD Prefixes stored in file. 
- Builder* aad_prefix_verifier( - std::shared_ptr aad_prefix_verifier) { - if (aad_prefix_verifier == NULLPTR) - return this; + Builder* aad_prefix_verifier(std::shared_ptr aad_prefix_verifier) { + if (aad_prefix_verifier == NULLPTR) return this; DCHECK(aad_prefix_verifier_ == NULLPTR); aad_prefix_verifier_ = aad_prefix_verifier; @@ -279,43 +264,33 @@ class PARQUET_EXPORT FileDecryptionProperties { } std::shared_ptr build() { - return std::shared_ptr( - new FileDecryptionProperties(footer_key_, - key_retriever_, - check_plaintext_footer_integrity_, - aad_prefix_, - aad_prefix_verifier_, - column_properties_)); + return std::shared_ptr(new FileDecryptionProperties( + footer_key_, key_retriever_, check_plaintext_footer_integrity_, aad_prefix_, + aad_prefix_verifier_, column_properties_)); } - private: + private: std::string footer_key_; std::string aad_prefix_; std::shared_ptr aad_prefix_verifier_; std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_properties_; + std::shared_ptr, + schema::ColumnPath::CmpColumnPath> + column_properties_; std::shared_ptr key_retriever_; bool check_plaintext_footer_integrity_; }; - const std::string& column_key( - const std::shared_ptr& column_path); + const std::string& column_key(const std::shared_ptr& column_path); - const std::string& footer_key() { - return footer_key_; - } + const std::string& footer_key() { return footer_key_; } const std::string& aad_prefix() { return aad_prefix_; } - std::shared_ptr key_retriever() { - return key_retriever_; - } + std::shared_ptr key_retriever() { return key_retriever_; } - bool check_plaintext_footer_integrity() { - return check_plaintext_footer_integrity_; - } + bool check_plaintext_footer_integrity() { return check_plaintext_footer_integrity_; } const std::shared_ptr& aad_prefix_verifier() { return aad_prefix_verifier_; @@ -327,8 +302,8 @@ class PARQUET_EXPORT FileDecryptionProperties { std::shared_ptr aad_prefix_verifier_; std::map, - std::shared_ptr, - 
schema::ColumnPath::CmpColumnPath> column_properties_; + std::shared_ptr, schema::ColumnPath::CmpColumnPath> + column_properties_; std::shared_ptr key_retriever_; bool check_plaintext_footer_integrity_; @@ -336,21 +311,20 @@ class PARQUET_EXPORT FileDecryptionProperties { FileDecryptionProperties( const std::string& footer_key, const std::shared_ptr& key_retriever, - bool check_plaintext_footer_integrity, - const std::string& aad_prefix, + bool check_plaintext_footer_integrity, const std::string& aad_prefix, std::shared_ptr aad_prefix_verifier, const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties); + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& column_properties); }; class PARQUET_EXPORT FileEncryptionProperties { public: class Builder { - public: - Builder(const std::string& footer_key) - : parquet_cipher_(DEFAULT_ENCRYPTION_ALGORITHM), - encrypted_footer_(DEFAULT_ENCRYPTED_FOOTER) { + public: + explicit Builder(const std::string& footer_key) + : parquet_cipher_(DEFAULT_ENCRYPTION_ALGORITHM), + encrypted_footer_(DEFAULT_ENCRYPTED_FOOTER) { footer_key_ = footer_key; store_aad_prefix_in_file_ = false; } @@ -376,8 +350,7 @@ class PARQUET_EXPORT FileEncryptionProperties { // Set a key retrieval metadata. // use either footer_key_metadata or footer_key_id, not both. Builder* footer_key_metadata(const std::string& footer_key_metadata) { - if (footer_key_metadata.empty()) - return this; + if (footer_key_metadata.empty()) return this; DCHECK(footer_key_metadata_.empty()); footer_key_metadata_ = footer_key_metadata; @@ -386,8 +359,7 @@ class PARQUET_EXPORT FileEncryptionProperties { // Set the file AAD Prefix. 
Builder* aad_prefix(const std::string& aad_prefix) { - if (aad_prefix.empty()) - return this; + if (aad_prefix.empty()) return this; DCHECK(aad_prefix_.empty()); aad_prefix_ = aad_prefix; @@ -409,13 +381,11 @@ class PARQUET_EXPORT FileEncryptionProperties { // If called, the file columns not in the list will be left unencrypted. Builder* column_properties( const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& - column_properties){ - if (column_properties.size () == 0) - return this; + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& column_properties) { + if (column_properties.size() == 0) return this; - if (column_properties_.size () != 0) + if (column_properties_.size() != 0) throw ParquetException("Column properties already set"); column_properties_ = column_properties; @@ -423,17 +393,12 @@ class PARQUET_EXPORT FileEncryptionProperties { } std::shared_ptr build() { - return std::shared_ptr( - new FileEncryptionProperties(parquet_cipher_, - footer_key_, - footer_key_metadata_, - encrypted_footer_, - aad_prefix_, - store_aad_prefix_in_file_, - column_properties_)); + return std::shared_ptr(new FileEncryptionProperties( + parquet_cipher_, footer_key_, footer_key_metadata_, encrypted_footer_, + aad_prefix_, store_aad_prefix_in_file_, column_properties_)); } - private: + private: ParquetCipher::type parquet_cipher_; bool encrypted_footer_; std::string footer_key_; @@ -442,29 +407,28 @@ class PARQUET_EXPORT FileEncryptionProperties { std::string aad_prefix_; bool store_aad_prefix_in_file_; std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_properties_; + std::shared_ptr, + schema::ColumnPath::CmpColumnPath> + column_properties_; }; bool encrypted_footer() const { return encrypted_footer_; } - const EncryptionAlgorithm algorithm() { - return algorithm_; - } + const EncryptionAlgorithm algorithm() { return algorithm_; } const std::string& footer_encryption_key() { - return (encrypted_footer_? 
footer_key_ : NULL_STRING); + return (encrypted_footer_ ? footer_key_ : NULL_STRING); } const std::string& footer_encryption_key_metadata() { - return (encrypted_footer_? footer_key_metadata_ : NULL_STRING); + return (encrypted_footer_ ? footer_key_metadata_ : NULL_STRING); } const std::string& footer_signing_key() { - return (encrypted_footer_? NULL_STRING : footer_key_); + return (encrypted_footer_ ? NULL_STRING : footer_key_); } const std::string& footer_signing_key_metadata() { - return (encrypted_footer_? NULL_STRING : footer_key_metadata_); + return (encrypted_footer_ ? NULL_STRING : footer_key_metadata_); } const std::string& file_aad() const { return file_aad_; } @@ -480,19 +444,16 @@ class PARQUET_EXPORT FileEncryptionProperties { std::string file_aad_; std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath> column_properties_; - - FileEncryptionProperties(ParquetCipher::type cipher, - const std::string& footer_key, - const std::string& footer_key_metadata, - bool encrypted_footer, - const std::string& aad_prefix, - bool store_aad_prefix_in_file, - const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& - column_properties); + std::shared_ptr, schema::ColumnPath::CmpColumnPath> + column_properties_; + + FileEncryptionProperties( + ParquetCipher::type cipher, const std::string& footer_key, + const std::string& footer_key_metadata, bool encrypted_footer, + const std::string& aad_prefix, bool store_aad_prefix_in_file, + const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& column_properties); }; } // namespace parquet diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index b7148c61d9ae..6ef69a7de6e5 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -39,8 +39,8 @@ #include "parquet/properties.h" #include "parquet/schema.h" #include "parquet/types.h" -#include "parquet/util/memory.h" #include "parquet/util/crypto.h" +#include "parquet/util/memory.h" 
namespace parquet { @@ -93,7 +93,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { file_crypto_metadata_(file_crypto_metadata), properties_(props), row_group_ordinal_((int16_t)row_group_number), - file_decryptor_(file_decryptor){ + file_decryptor_(file_decryptor) { row_group_metadata_ = file_metadata->RowGroup(row_group_number); } @@ -103,8 +103,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::unique_ptr GetColumnPageReader(int i) override { // Read column chunk from the file - auto col = row_group_metadata_->ColumnChunk(i, row_group_ordinal_, - file_decryptor_); + auto col = row_group_metadata_->ColumnChunk(i, row_group_ordinal_, file_decryptor_); int64_t col_start = col->data_page_offset(); if (col->has_dictionary_page() && col->dictionary_page_offset() > 0 && col_start > col->dictionary_page_offset()) { @@ -140,10 +139,8 @@ class SerializedRowGroup : public RowGroupReader::Contents { if (!encrypted) { return PageReader::Open(stream, col->num_values(), col->compression(), - col->has_dictionary_page(), - row_group_ordinal_, - (int16_t)i/* column_ordinal */, - properties_.memory_pool()); + col->has_dictionary_page(), row_group_ordinal_, + (int16_t)i/* column_ordinal */, properties_.memory_pool()); } // the column is encrypted @@ -154,9 +151,8 @@ class SerializedRowGroup : public RowGroupReader::Contents { auto data_decryptor = file_decryptor_->GetFooterDecryptorForColumnData(); return PageReader::Open(stream, col->num_values(), col->compression(), - col->has_dictionary_page(), row_group_ordinal_, - (int16_t)i, properties_.memory_pool(), - meta_decryptor, data_decryptor); + col->has_dictionary_page(), row_group_ordinal_, (int16_t)i, + properties_.memory_pool(), meta_decryptor, data_decryptor); } // file is encrypted and the column is encrypted with its own key @@ -165,18 +161,14 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::shared_ptr column_path = std::make_shared(crypto_metadata->path_in_schema()); - 
auto meta_decryptor = file_decryptor_->GetColumnMetaDecryptor( - column_path, - column_key_metadata); - auto data_decryptor = file_decryptor_->GetColumnDataDecryptor( - column_path, - column_key_metadata); - - return PageReader::Open(stream, col->num_values(), - col->compression(), - col->has_dictionary_page(), row_group_ordinal_, - (int16_t)i, properties_.memory_pool(), - meta_decryptor, data_decryptor); + auto meta_decryptor = + file_decryptor_->GetColumnMetaDecryptor(column_path, column_key_metadata); + auto data_decryptor = + file_decryptor_->GetColumnDataDecryptor(column_path, column_key_metadata); + + return PageReader::Open(stream, col->num_values(), col->compression(), + col->has_dictionary_page(), row_group_ordinal_, (int16_t)i, + properties_.memory_pool(), meta_decryptor, data_decryptor); } private: @@ -204,10 +196,9 @@ class SerializedFile : public ParquetFileReader::Contents { void Close() override {} std::shared_ptr GetRowGroup(int i) override { - std::unique_ptr contents( - new SerializedRowGroup(source_, file_metadata_.get(), - file_crypto_metadata_.get(), i, properties_, - file_decryptor_.get())); + std::unique_ptr contents(new SerializedRowGroup( + source_, file_metadata_.get(), file_crypto_metadata_.get(), i, properties_, + file_decryptor_.get())); return std::make_shared(std::move(contents)); } @@ -284,49 +275,51 @@ class SerializedFile : public ParquetFileReader::Contents { if (!algo.aad.aad_prefix.empty()) { if (!aad_prefix.empty()) { if (aad_prefix.compare(algo.aad.aad_prefix) != 0) { - throw ParquetException("ADD Prefix in file and " - "in properties is not the same"); + throw ParquetException( + "ADD Prefix in file and " + "in properties is not the same"); } } aad_prefix = algo.aad.aad_prefix; std::shared_ptr aad_prefix_verifier = - file_decryption_properties->aad_prefix_verifier(); - if (aad_prefix_verifier != NULLPTR) - aad_prefix_verifier->check(aad_prefix); + file_decryption_properties->aad_prefix_verifier(); + if (aad_prefix_verifier != 
NULLPTR) aad_prefix_verifier->check(aad_prefix); } if (algo.aad.supply_aad_prefix && aad_prefix.empty()) { - throw ParquetException("AAD prefix used for file encryption, " - "but not stored in file and not supplied " - "in decryption properties"); + throw ParquetException( + "AAD prefix used for file encryption, " + "but not stored in file and not supplied " + "in decryption properties"); } std::string file_aad = aad_prefix + algo.aad.aad_file_unique; file_decryptor_->file_aad(file_aad); file_decryptor_->algorithm(algo.algorithm); - file_decryptor_->footer_key_metadata(file_metadata_->footer_signing_key_metadata()); + file_decryptor_->footer_key_metadata( + file_metadata_->footer_signing_key_metadata()); if (file_decryption_properties->check_plaintext_footer_integrity()) { if (metadata_len - read_metadata_len != 28) { - throw ParquetException("Invalid parquet file. Cannot verify plaintext" - "mode footer."); + throw ParquetException( + "Invalid parquet file. Cannot verify plaintext" + "mode footer."); } auto encryptor = file_decryptor_->GetFooterSigningEncryptor(); - if (! file_metadata_->verify(encryptor, metadata_buffer->data() - + read_metadata_len)) { - throw ParquetException("Invalid parquet file. Could not verify plaintext" - " footer metadata"); + if (!file_metadata_->verify(encryptor, + metadata_buffer->data() + read_metadata_len)) { + throw ParquetException( + "Invalid parquet file. Could not verify plaintext" + " footer metadata"); } } } - } - // encryption with encrypted footer - else { + } else { + // encryption with encrypted footer // both metadata & crypto metadata length uint32_t footer_len = arrow::util::SafeLoadAs( reinterpret_cast(footer_buffer->data()) + footer_read_size - kFooterSize); int64_t crypto_metadata_start = file_size - kFooterSize - footer_len; - if (kFooterSize + footer_len > file_size) { throw ParquetException( "Invalid parquet file. 
File is less than " @@ -347,13 +340,14 @@ class SerializedFile : public ParquetFileReader::Contents { } auto file_decryption_properties = properties_.file_decryption_properties(); if (file_decryption_properties == nullptr) { - throw ParquetException("No decryption properties are provided. Could not read " - "encrypted footer metadata"); + throw ParquetException( + "No decryption properties are provided. Could not read " + "encrypted footer metadata"); } file_decryptor_.reset(new InternalFileDecryptor(file_decryption_properties)); uint32_t crypto_metadata_len = footer_len; file_crypto_metadata_ = - FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); + FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); EncryptionAlgorithm algo = file_crypto_metadata_->encryption_algorithm(); std::string aad_prefix = file_decryption_properties->aad_prefix(); @@ -361,20 +355,21 @@ class SerializedFile : public ParquetFileReader::Contents { if (!algo.aad.aad_prefix.empty()) { if (!aad_prefix.empty()) { if (aad_prefix.compare(algo.aad.aad_prefix) != 0) { - throw ParquetException("ADD Prefix in file and in properties " - "is not the same"); + throw ParquetException( + "ADD Prefix in file and in properties " + "is not the same"); } } aad_prefix = algo.aad.aad_prefix; std::shared_ptr aad_prefix_verifier = - file_decryption_properties->aad_prefix_verifier(); - if (aad_prefix_verifier != NULLPTR) - aad_prefix_verifier->check(aad_prefix); + file_decryption_properties->aad_prefix_verifier(); + if (aad_prefix_verifier != NULLPTR) aad_prefix_verifier->check(aad_prefix); } if (algo.aad.supply_aad_prefix && aad_prefix.empty()) { - throw ParquetException("AAD prefix used for file encryption, " - "but not stored in file and not supplied " - "in decryption properties"); + throw ParquetException( + "AAD prefix used for file encryption, " + "but not stored in file and not supplied " + "in decryption properties"); } std::string fileAAD = aad_prefix 
+ algo.aad.aad_file_unique; // save fileAAD for later use @@ -382,7 +377,8 @@ class SerializedFile : public ParquetFileReader::Contents { file_decryptor_->algorithm(algo.algorithm); file_decryptor_->footer_key_metadata(file_crypto_metadata_->key_metadata()); - int64_t metadata_offset = file_size - kFooterSize - footer_len + crypto_metadata_len; + int64_t metadata_offset = + file_size - kFooterSize - footer_len + crypto_metadata_len; uint32_t metadata_len = footer_len - crypto_metadata_len; std::shared_ptr metadata_buffer; PARQUET_THROW_NOT_OK( diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 6904d4105641..5653a146488b 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -28,8 +28,8 @@ #include "parquet/platform.h" #include "parquet/internal_file_encryptor.h" #include "parquet/schema.h" -#include "parquet/util/memory.h" #include "parquet/util/crypto.h" +#include "parquet/util/memory.h" using arrow::MemoryPool; @@ -85,8 +85,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents { public: RowGroupSerializer(const std::shared_ptr& sink, RowGroupMetaDataBuilder* metadata, - int16_t row_group_ordinal, - const WriterProperties* properties, + int16_t row_group_ordinal, const WriterProperties* properties, bool buffered_row_group = false, InternalFileEncryptor* file_encryptor = NULLPTR) : sink_(sink), @@ -134,19 +133,17 @@ class RowGroupSerializer : public RowGroupWriter::Contents { ++next_column_index_; const ColumnDescriptor* column_descr = col_meta->descr(); - auto meta_encryptor = file_encryptor_ - ? file_encryptor_->GetColumnMetaEncryptor(column_descr->path()) - : NULLPTR; - auto data_encryptor = file_encryptor_ - ? 
file_encryptor_->GetColumnDataEncryptor(column_descr->path()) - : NULLPTR; - - std::unique_ptr pager = - PageWriter::Open(sink_, properties_->compression(column_descr->path()), - col_meta, row_group_ordinal_, - (int16_t)(current_column_index_-1), - properties_->memory_pool(), false, - meta_encryptor, data_encryptor); + auto meta_encryptor = + file_encryptor_ ? file_encryptor_->GetColumnMetaEncryptor(column_descr->path()) + : NULLPTR; + auto data_encryptor = + file_encryptor_ ? file_encryptor_->GetColumnDataEncryptor(column_descr->path()) + : NULLPTR; + + std::unique_ptr pager = PageWriter::Open( + sink_, properties_->compression(column_descr->path()), col_meta, + row_group_ordinal_, (int16_t)(current_column_index_ - 1), + properties_->memory_pool(), false, meta_encryptor, data_encryptor); column_writers_[0] = ColumnWriter::Make(col_meta, std::move(pager), properties_); return column_writers_[0].get(); } @@ -243,18 +240,17 @@ class RowGroupSerializer : public RowGroupWriter::Contents { for (int i = 0; i < num_columns(); i++) { auto col_meta = metadata_->NextColumnChunk(); const ColumnDescriptor* column_descr = col_meta->descr(); - auto meta_encryptor = file_encryptor_ - ? file_encryptor_->GetColumnMetaEncryptor(column_descr->path()) - : NULLPTR; - auto data_encryptor = file_encryptor_ - ? file_encryptor_->GetColumnDataEncryptor(column_descr->path()) - : NULLPTR; + auto meta_encryptor = + file_encryptor_ ? file_encryptor_->GetColumnMetaEncryptor(column_descr->path()) + : NULLPTR; + auto data_encryptor = + file_encryptor_ ? 
file_encryptor_->GetColumnDataEncryptor(column_descr->path()) + : NULLPTR; std::unique_ptr pager = - PageWriter::Open(sink_, properties_->compression(column_descr->path()), - col_meta, (int16_t)row_group_ordinal_, - (int16_t)current_column_index_, - properties_->memory_pool(), buffered_row_group_, - meta_encryptor, data_encryptor); + PageWriter::Open(sink_, properties_->compression(column_descr->path()), + col_meta, (int16_t)row_group_ordinal_, + (int16_t)current_column_index_, properties_->memory_pool(), + buffered_row_group_, meta_encryptor, data_encryptor); column_writers_.push_back( ColumnWriter::Make(col_meta, std::move(pager), properties_)); } @@ -309,23 +305,24 @@ class FileSerializer : public ParquetFileWriter::Contents { auto footer_encryptor = file_encryptor_->GetFooterEncryptor(); WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryptor, true); - uint32_t footer_and_crypto_len = static_cast(sink_->Tell() - metadata_start); + uint32_t footer_and_crypto_len = + static_cast(sink_->Tell() - metadata_start); sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4); sink_->Write(PARQUET_EMAGIC, 4); } else { // footer plain mode EncryptionAlgorithm signing_encryption; - EncryptionAlgorithm algo = file_encryption->algorithm(); + EncryptionAlgorithm algo = file_encryption->algorithm(); signing_encryption.aad.aad_file_unique = algo.aad.aad_file_unique; signing_encryption.aad.supply_aad_prefix = algo.aad.supply_aad_prefix; if (!algo.aad.supply_aad_prefix) signing_encryption.aad.aad_prefix = algo.aad.aad_prefix; signing_encryption.algorithm = ParquetCipher::AES_GCM_V1; file_metadata_ = metadata_->Finish( - &signing_encryption, - file_encryption->footer_signing_key_metadata ()); + &signing_encryption, file_encryption->footer_signing_key_metadata()); auto footer_signing_encryptor = file_encryptor_->GetFooterSigningEncryptor(); - WriteFileMetaData(*file_metadata_, sink_.get(), footer_signing_encryptor, false); + WriteFileMetaData(*file_metadata_, sink_.get(), 
footer_signing_encryptor, + false); } } @@ -445,8 +442,7 @@ std::unique_ptr ParquetFileWriter::Open( } void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sink, - const std::shared_ptr& encryptor, - bool encrypt_footer) { + const std::shared_ptr& encryptor, bool encrypt_footer) { if (encryptor == nullptr) { // Write MetaData int64_t position = -1; @@ -464,8 +460,7 @@ void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sin if (encrypt_footer) { // encrypt and write to sink file_metadata.WriteTo(sink, encryptor); - } - else { + } else { uint32_t metadata_len = static_cast(sink->Tell()); file_metadata.WriteTo(sink, encryptor); metadata_len = static_cast(sink->Tell()) - metadata_len; diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index 5082cbe8839b..1935093bbd97 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -1,5 +1,22 @@ -#include "parquet/encryption_properties.h" +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ #include "parquet/internal_file_decryptor.h" +#include "parquet/encryption_properties.h" #include "parquet/util/crypto.h" namespace parquet { @@ -12,10 +29,11 @@ static inline uint8_t* str2bytes(const std::string& str) { return reinterpret_cast(cbytes); } -FooterSigningEncryptor::FooterSigningEncryptor( - ParquetCipher::type algorithm, const std::string& key, - const std::string& file_aad, const std::string& aad) - : algorithm_(algorithm), key_(key), file_aad_(file_aad), aad_(aad) { +FooterSigningEncryptor::FooterSigningEncryptor(ParquetCipher::type algorithm, + const std::string& key, + const std::string& file_aad, + const std::string& aad) + : algorithm_(algorithm), key_(key), file_aad_(file_aad), aad_(aad) { aes_encryptor_.reset(new parquet_encryption::AesEncryptor( algorithm, static_cast(key_.size()), true)); } @@ -24,41 +42,36 @@ int FooterSigningEncryptor::CiphertextSizeDelta() { return aes_encryptor_->CiphertextSizeDelta(); } -int FooterSigningEncryptor::SignedFooterEncrypt( - const uint8_t* footer, int footer_len, - uint8_t* nonce, uint8_t* encrypted_footer) { +int FooterSigningEncryptor::SignedFooterEncrypt(const uint8_t* footer, int footer_len, + uint8_t* nonce, + uint8_t* encrypted_footer) { return aes_encryptor_->SignedFooterEncrypt( - footer, footer_len, str2bytes(key_), static_cast(key_.size()), - str2bytes(aad_), static_cast(aad_.size()), nonce, encrypted_footer); + footer, footer_len, str2bytes(key_), static_cast(key_.size()), str2bytes(aad_), + static_cast(aad_.size()), nonce, encrypted_footer); } // Decryptor -Decryptor::Decryptor( - parquet_encryption::AesDecryptor* aes_decryptor, - const std::string& key, const std::string& file_aad, - const std::string& aad) - : aes_decryptor_(aes_decryptor), key_(key) - , file_aad_(file_aad), aad_(aad) {} - -int Decryptor::CiphertextSizeDelta() { - return aes_decryptor_->CiphertextSizeDelta(); -} +Decryptor::Decryptor(parquet_encryption::AesDecryptor* aes_decryptor, + const std::string& key, const 
std::string& file_aad, + const std::string& aad) + : aes_decryptor_(aes_decryptor), key_(key), file_aad_(file_aad), aad_(aad) {} -int Decryptor::Decrypt(const uint8_t* ciphertext, int ciphertext_len, uint8_t* plaintext) { - return aes_decryptor_->Decrypt( - ciphertext, ciphertext_len, - str2bytes(key_), static_cast(key_.size()), - str2bytes(aad_), static_cast(aad_.size()), plaintext); +int Decryptor::CiphertextSizeDelta() { return aes_decryptor_->CiphertextSizeDelta(); } + +int Decryptor::Decrypt(const uint8_t* ciphertext, int ciphertext_len, + uint8_t* plaintext) { + return aes_decryptor_->Decrypt(ciphertext, ciphertext_len, str2bytes(key_), + static_cast(key_.size()), str2bytes(aad_), + static_cast(aad_.size()), plaintext); } // InternalFileDecryptor InternalFileDecryptor::InternalFileDecryptor(FileDecryptionProperties* properties) : properties_(properties) {} -std::shared_ptr InternalFileDecryptor::GetFooterSigningEncryptor() -{ - if (footer_signing_encryptor_ != NULLPTR) - return footer_signing_encryptor_; +std::shared_ptr +InternalFileDecryptor::GetFooterSigningEncryptor() { + if (footer_signing_encryptor_ != NULLPTR) return footer_signing_encryptor_; std::string footer_key = properties_->footer_key(); // ignore footer key metadata if footer key is explicitly set via API if (footer_key.empty()) { @@ -67,23 +80,23 @@ std::shared_ptr InternalFileDecryptor::GetFooterSigningE if (properties_->key_retriever() == nullptr) throw ParquetException("No footer key or key retriever"); try { - footer_key = - properties_->key_retriever()->GetKey(footer_key_metadata_); - } catch (KeyAccessDeniedException &e) { + footer_key = properties_->key_retriever()->GetKey(footer_key_metadata_); + } catch (KeyAccessDeniedException& e) { std::stringstream ss; ss << "Footer key: access denied " << e.what() << "\n"; throw ParquetException(ss.str()); } } if (footer_key.empty()) { - throw ParquetException("Footer key unavailable. 
Could not verify " - "plaintext footer metadata"); + throw ParquetException( + "Footer key unavailable. Could not verify " + "plaintext footer metadata"); } std::string aad = parquet_encryption::createFooterAAD(file_aad_); - footer_signing_encryptor_ = std::make_shared( - algorithm_, footer_key, file_aad_, aad); + footer_signing_encryptor_ = + std::make_shared(algorithm_, footer_key, file_aad_, aad); return footer_signing_encryptor_; } @@ -93,8 +106,7 @@ std::shared_ptr InternalFileDecryptor::GetFooterDecryptor() { } std::shared_ptr InternalFileDecryptor::GetFooterDecryptorForColumnMeta( - const std::string& aad) -{ + const std::string& aad) { return GetFooterDecryptor(aad, true); } @@ -105,8 +117,7 @@ std::shared_ptr InternalFileDecryptor::GetFooterDecryptorForColumnDat std::shared_ptr InternalFileDecryptor::GetFooterDecryptor( const std::string& aad, bool metadata) { - if (footer_decryptor_ != NULLPTR) - return footer_decryptor_; + if (footer_decryptor_ != NULLPTR) return footer_decryptor_; std::string footer_key = properties_->footer_key(); if (footer_key.empty()) { if (footer_key_metadata_.empty()) @@ -115,58 +126,54 @@ std::shared_ptr InternalFileDecryptor::GetFooterDecryptor( throw ParquetException("No footer key or key retriever"); try { footer_key = properties_->key_retriever()->GetKey(footer_key_metadata_); - } catch (KeyAccessDeniedException &e) { + } catch (KeyAccessDeniedException& e) { std::stringstream ss; - ss << "Footer key: access denied " << e.what() << "\n";; + ss << "Footer key: access denied " << e.what() << "\n"; throw ParquetException(ss.str()); } } if (footer_key.empty()) { - throw ParquetException("Invalid footer encryption key. " - "Could not parse footer metadata"); + throw ParquetException( + "Invalid footer encryption key. " + "Could not parse footer metadata"); } - auto aes_decryptor = metadata - ? 
GetMetaAesDecryptor(footer_key.size()) - : GetDataAesDecryptor(footer_key.size()); - footer_decryptor_ = std::make_shared(aes_decryptor, footer_key, file_aad_, aad); + auto aes_decryptor = metadata ? GetMetaAesDecryptor(footer_key.size()) + : GetDataAesDecryptor(footer_key.size()); + footer_decryptor_ = + std::make_shared(aes_decryptor, footer_key, file_aad_, aad); return footer_decryptor_; } std::shared_ptr InternalFileDecryptor::GetColumnMetaDecryptor( std::shared_ptr column_path, - const std::string& column_key_metadata, - const std::string& aad) { + const std::string& column_key_metadata, const std::string& aad) { return GetColumnDecryptor(column_path, column_key_metadata, aad, true); } std::shared_ptr InternalFileDecryptor::GetColumnDataDecryptor( std::shared_ptr column_path, - const std::string& column_key_metadata, - const std::string& aad) { + const std::string& column_key_metadata, const std::string& aad) { return GetColumnDecryptor(column_path, column_key_metadata, aad, false); } std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( std::shared_ptr column_path, - const std::string& column_key_metadata, - const std::string& aad, bool metadata) { + const std::string& column_key_metadata, const std::string& aad, bool metadata) { std::string column_key; // first look if we already got the key from before - if (column_map_ != NULLPTR - && column_map_->find(column_path) != column_map_->end()) { + if (column_map_ != NULLPTR && column_map_->find(column_path) != column_map_->end()) { column_key = column_map_->at(column_path); } else { column_key = properties_->column_key(column_path); // No explicit column key given via API. Retrieve via key metadata. 
if (column_key.empty() && !column_key_metadata.empty() && - properties_->key_retriever() != nullptr){ + properties_->key_retriever() != nullptr) { try { column_key = properties_->key_retriever()->GetKey(column_key_metadata); - } catch (KeyAccessDeniedException &e) { + } catch (KeyAccessDeniedException& e) { std::stringstream ss; - ss << "HiddenColumnException, path=" + - column_path->ToDotString() + " " + ss << "HiddenColumnException, path=" + column_path->ToDotString() + " " << e.what() << "\n"; throw HiddenColumnException(ss.str()); } @@ -182,9 +189,8 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( (*column_map_)[column_path] = column_key; } - auto aes_decryptor = metadata - ? GetMetaAesDecryptor(column_key.size()) - : GetDataAesDecryptor(column_key.size()); + auto aes_decryptor = metadata ? GetMetaAesDecryptor(column_key.size()) + : GetDataAesDecryptor(column_key.size()); return std::make_shared(aes_decryptor, column_key, file_aad_, aad); } @@ -194,19 +200,20 @@ parquet_encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor( int key_len = static_cast(key_size); if (key_len == 16) { if (meta_decryptor_128_ == NULLPTR) { - meta_decryptor_128_.reset(new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); + meta_decryptor_128_.reset( + new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); } return meta_decryptor_128_.get(); - } - else if (key_len == 24) { + } else if (key_len == 24) { if (meta_decryptor_196_ == NULLPTR) { - meta_decryptor_196_.reset(new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); + meta_decryptor_196_.reset( + new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); } return meta_decryptor_196_.get(); - } - else if (key_len == 32) { + } else if (key_len == 32) { if (meta_decryptor_256_ == NULLPTR) { - meta_decryptor_256_.reset(new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); + meta_decryptor_256_.reset( + new 
parquet_encryption::AesDecryptor(algorithm_, key_len, false)); } return meta_decryptor_256_.get(); } @@ -218,23 +225,24 @@ parquet_encryption::AesDecryptor* InternalFileDecryptor::GetDataAesDecryptor( int key_len = static_cast(key_size); if (key_len == 16) { if (data_decryptor_128_ == NULLPTR) { - data_decryptor_128_.reset(new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); + data_decryptor_128_.reset( + new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); } return data_decryptor_128_.get(); - } - else if (key_len == 24) { + } else if (key_len == 24) { if (data_decryptor_196_ == NULLPTR) { - data_decryptor_196_.reset(new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); + data_decryptor_196_.reset( + new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); } return data_decryptor_196_.get(); - } - else if (key_len == 32) { + } else if (key_len == 32) { if (data_decryptor_256_ == NULLPTR) { - data_decryptor_256_.reset(new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); + data_decryptor_256_.reset( + new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); } return data_decryptor_256_.get(); } throw ParquetException("encryption key must be 16, 24 or 32 bytes in length"); } -} // namespace parquet +} // namespace parquet diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h index 41740dc182f7..987c8ec77b9b 100644 --- a/cpp/src/parquet/internal_file_decryptor.h +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -1,13 +1,33 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + #ifndef INTERNAL_FILE_DECRYPTOR_H #define INTERNAL_FILE_DECRYPTOR_H #include +#include +#include + #include "parquet/schema.h" namespace parquet_encryption { - class AesDecryptor; - class AesEncryptor; -} +class AesDecryptor; +class AesEncryptor; +} // namespace parquet_encryption namespace parquet { @@ -18,8 +38,8 @@ class FooterSigningEncryptor { FooterSigningEncryptor(ParquetCipher::type algorithm, const std::string& key, const std::string& file_aad, const std::string& aad); int CiphertextSizeDelta(); - int SignedFooterEncrypt(const uint8_t* footer, int footer_len, - uint8_t* nonce, uint8_t* encrypted_footer); + int SignedFooterEncrypt(const uint8_t* footer, int footer_len, uint8_t* nonce, + uint8_t* encrypted_footer); private: ParquetCipher::type algorithm_; @@ -32,9 +52,8 @@ class FooterSigningEncryptor { class Decryptor { public: - Decryptor(parquet_encryption::AesDecryptor* decryptor, - const std::string& key, const std::string& file_aad, - const std::string& aad); + Decryptor(parquet_encryption::AesDecryptor* decryptor, const std::string& key, + const std::string& file_aad, const std::string& aad); const std::string& file_aad() const { return file_aad_; } void aad(const std::string& aad) { aad_ = aad; } @@ -69,26 +88,23 @@ class InternalFileDecryptor { FileDecryptionProperties* properties() { return properties_; } std::shared_ptr GetFooterDecryptor(); - std::shared_ptr GetFooterDecryptorForColumnMeta( - const std::string& aad = ""); - std::shared_ptr GetFooterDecryptorForColumnData( - const std::string& aad = ""); + std::shared_ptr 
GetFooterDecryptorForColumnMeta(const std::string& aad = ""); + std::shared_ptr GetFooterDecryptorForColumnData(const std::string& aad = ""); std::shared_ptr GetColumnMetaDecryptor( std::shared_ptr column_path, - const std::string& column_key_metadata, - const std::string& aad = ""); + const std::string& column_key_metadata, const std::string& aad = ""); std::shared_ptr GetColumnDataDecryptor( std::shared_ptr column_path, - const std::string& column_key_metadata, - const std::string& aad = ""); + const std::string& column_key_metadata, const std::string& aad = ""); private: FileDecryptionProperties* properties_; // Concatenation of aad_prefix (if exists) and aad_file_unique std::string file_aad_; // A map between ColumnPath and their encryption keys - std::shared_ptr, - std::string, parquet::schema::ColumnPath::CmpColumnPath>> column_map_; + std::shared_ptr, std::string, + parquet::schema::ColumnPath::CmpColumnPath>> + column_map_; ParquetCipher::type algorithm_; std::string footer_key_metadata_; std::shared_ptr footer_decryptor_; @@ -101,17 +117,16 @@ class InternalFileDecryptor { std::unique_ptr data_decryptor_196_; std::unique_ptr data_decryptor_256_; - std::shared_ptr GetFooterDecryptor( - const std::string& aad, bool metadata); + std::shared_ptr GetFooterDecryptor(const std::string& aad, bool metadata); std::shared_ptr GetColumnDecryptor( std::shared_ptr column_path, - const std::string& column_key_metadata, - const std::string& aad, bool metadata = false); + const std::string& column_key_metadata, const std::string& aad, + bool metadata = false); parquet_encryption::AesDecryptor* GetMetaAesDecryptor(size_t key_size); parquet_encryption::AesDecryptor* GetDataAesDecryptor(size_t key_size); }; -} +} // namespace parquet -#endif // INTERNAL_FILE_ENCRYPTORS_H +#endif // INTERNAL_FILE_ENCRYPTORS_H diff --git a/cpp/src/parquet/internal_file_encryptor.cc b/cpp/src/parquet/internal_file_encryptor.cc index fcd224339a8d..c2127f06dd12 100644 --- 
a/cpp/src/parquet/internal_file_encryptor.cc +++ b/cpp/src/parquet/internal_file_encryptor.cc @@ -1,5 +1,22 @@ -#include "parquet/encryption_properties.h" +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + #include "parquet/internal_file_encryptor.h" +#include "parquet/encryption_properties.h" #include "parquet/util/crypto.h" namespace parquet { @@ -12,20 +29,17 @@ static inline uint8_t* str2bytes(const std::string& str) { } // Encryptor -Encryptor::Encryptor( - parquet_encryption::AesEncryptor* aes_encryptor, const std::string& key, - const std::string& file_aad, const std::string& aad) - : aes_encryptor_(aes_encryptor), key_(key) - , file_aad_(file_aad), aad_(aad) {} - -int Encryptor::CiphertextSizeDelta() { - return aes_encryptor_->CiphertextSizeDelta(); -} +Encryptor::Encryptor(parquet_encryption::AesEncryptor* aes_encryptor, + const std::string& key, const std::string& file_aad, + const std::string& aad) + : aes_encryptor_(aes_encryptor), key_(key), file_aad_(file_aad), aad_(aad) {} + +int Encryptor::CiphertextSizeDelta() { return aes_encryptor_->CiphertextSizeDelta(); } int Encryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* ciphertext) { - return aes_encryptor_->Encrypt( - plaintext, 
plaintext_len, str2bytes(key_), static_cast(key_.size()), - str2bytes(aad_), static_cast(aad_.size()), ciphertext); + return aes_encryptor_->Encrypt(plaintext, plaintext_len, str2bytes(key_), + static_cast(key_.size()), str2bytes(aad_), + static_cast(aad_.size()), ciphertext); } // InternalFileEncryptor @@ -38,8 +52,8 @@ std::shared_ptr InternalFileEncryptor::GetFooterEncryptor() { std::string footer_key = properties_->footer_encryption_key(); auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_key.size()); - return std::make_shared(aes_encryptor, footer_key, - properties_->file_aad(), aad); + return std::make_shared(aes_encryptor, footer_key, properties_->file_aad(), + aad); } std::shared_ptr InternalFileEncryptor::GetFooterSigningEncryptor() { @@ -62,9 +76,9 @@ std::shared_ptr InternalFileEncryptor::GetColumnDataEncryptor( return GetColumnEncryptor(column_path, false); } -std::shared_ptr InternalFileEncryptor::InternalFileEncryptor::GetColumnEncryptor( - const std::shared_ptr& column_path, - bool metadata) { +std::shared_ptr +InternalFileEncryptor::InternalFileEncryptor::GetColumnEncryptor( + const std::shared_ptr& column_path, bool metadata) { auto column_prop = properties_->column_properties(column_path); if (column_prop == NULLPTR) { return NULLPTR; @@ -77,16 +91,14 @@ std::shared_ptr InternalFileEncryptor::InternalFileEncryptor::GetColu } else { key = properties_->footer_signing_key(); } - } - else { + } else { key = column_prop->key(); } - + ParquetCipher::type algorithm = properties_->algorithm().algorithm; - auto aes_encryptor = metadata - ? GetMetaAesEncryptor(algorithm, key.size()) - : GetDataAesEncryptor(algorithm, key.size()); - + auto aes_encryptor = metadata ? 
GetMetaAesEncryptor(algorithm, key.size()) + : GetDataAesEncryptor(algorithm, key.size()); + std::string file_aad = properties_->file_aad(); // TODO: aad @@ -98,19 +110,20 @@ parquet_encryption::AesEncryptor* InternalFileEncryptor::GetMetaAesEncryptor( int key_len = static_cast(key_size); if (key_len == 16) { if (meta_encryptor_128_ == NULLPTR) { - meta_encryptor_128_.reset(new parquet_encryption::AesEncryptor(algorithm, key_len, true)); + meta_encryptor_128_.reset( + new parquet_encryption::AesEncryptor(algorithm, key_len, true)); } return meta_encryptor_128_.get(); - } - else if (key_len == 24) { + } else if (key_len == 24) { if (meta_encryptor_196_ == NULLPTR) { - meta_encryptor_196_.reset(new parquet_encryption::AesEncryptor(algorithm, key_len, true)); + meta_encryptor_196_.reset( + new parquet_encryption::AesEncryptor(algorithm, key_len, true)); } return meta_encryptor_196_.get(); - } - else if (key_len == 32) { + } else if (key_len == 32) { if (meta_encryptor_256_ == NULLPTR) { - meta_encryptor_256_.reset(new parquet_encryption::AesEncryptor(algorithm, key_len, true)); + meta_encryptor_256_.reset( + new parquet_encryption::AesEncryptor(algorithm, key_len, true)); } return meta_encryptor_256_.get(); } @@ -122,23 +135,24 @@ parquet_encryption::AesEncryptor* InternalFileEncryptor::GetDataAesEncryptor( int key_len = static_cast(key_size); if (key_len == 16) { if (data_encryptor_128_ == NULLPTR) { - data_encryptor_128_.reset(new parquet_encryption::AesEncryptor(algorithm, key_len, false)); + data_encryptor_128_.reset( + new parquet_encryption::AesEncryptor(algorithm, key_len, false)); } return data_encryptor_128_.get(); - } - else if (key_len == 24) { + } else if (key_len == 24) { if (data_encryptor_196_ == NULLPTR) { - data_encryptor_196_.reset(new parquet_encryption::AesEncryptor(algorithm, key_len, false)); + data_encryptor_196_.reset( + new parquet_encryption::AesEncryptor(algorithm, key_len, false)); } return data_encryptor_196_.get(); - } - else if (key_len 
== 32) { + } else if (key_len == 32) { if (data_encryptor_256_ == NULLPTR) { - data_encryptor_256_.reset(new parquet_encryption::AesEncryptor(algorithm, key_len, false)); + data_encryptor_256_.reset( + new parquet_encryption::AesEncryptor(algorithm, key_len, false)); } return data_encryptor_256_.get(); } throw ParquetException("encryption key must be 16, 24 or 32 bytes in length"); } -} // namespace parquet \ No newline at end of file +} // namespace parquet diff --git a/cpp/src/parquet/internal_file_encryptor.h b/cpp/src/parquet/internal_file_encryptor.h index 75c167b5339e..ccef1315f603 100644 --- a/cpp/src/parquet/internal_file_encryptor.h +++ b/cpp/src/parquet/internal_file_encryptor.h @@ -1,8 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ #ifndef INTERNAL_FILE_ENCRYPTOR_H #define INTERNAL_FILE_ENCRYPTOR_H +#include +#include + +#include "parquet/schema.h" + namespace parquet_encryption { - class AesEncryptor; +class AesEncryptor; } namespace parquet { @@ -11,9 +33,8 @@ class FileEncryptionProperties; class Encryptor { public: - Encryptor(parquet_encryption::AesEncryptor* aes_encryptor, - const std::string& key, const std::string& file_aad, - const std::string& aad); + Encryptor(parquet_encryption::AesEncryptor* aes_encryptor, const std::string& key, + const std::string& file_aad, const std::string& aad); const std::string& file_aad() { return file_aad_; } void aad(const std::string& aad) { aad_ = aad; } @@ -33,8 +54,10 @@ class InternalFileEncryptor { std::shared_ptr GetFooterEncryptor(); std::shared_ptr GetFooterSigningEncryptor(); - std::shared_ptr GetColumnMetaEncryptor(const std::shared_ptr& column_path); - std::shared_ptr GetColumnDataEncryptor(const std::shared_ptr& column_path); + std::shared_ptr GetColumnMetaEncryptor( + const std::shared_ptr& column_path); + std::shared_ptr GetColumnDataEncryptor( + const std::shared_ptr& column_path); private: FileEncryptionProperties* properties_; @@ -47,8 +70,7 @@ class InternalFileEncryptor { std::unique_ptr data_encryptor_256_; std::shared_ptr GetColumnEncryptor( - const std::shared_ptr& column_path, - bool metadata); + const std::shared_ptr& column_path, bool metadata); parquet_encryption::AesEncryptor* GetMetaAesEncryptor(ParquetCipher::type algorithm, size_t key_len); @@ -56,6 +78,6 @@ class InternalFileEncryptor { size_t key_len); }; -} +} // namespace parquet -#endif // INTERNAL_FILE_ENCRYPTORS_H \ No newline at end of file +#endif // INTERNAL_FILE_ENCRYPTORS_H diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 834ebf0b345c..355e01d7b601 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -23,6 +23,8 @@ #include "arrow/util/logging.h" +#include +#include // IWYU pragma: keep #include 
"parquet/exception.h" #include "parquet/internal_file_decryptor.h" #include "parquet/metadata.h" @@ -33,7 +35,6 @@ // ARROW-6096: The boost regex library must be used when compiling with gcc < 4.9 #if defined(PARQUET_USE_BOOST_REGEX) -#include #include // IWYU pragma: keep using ::boost::regex; using ::boost::regex_match; @@ -88,7 +89,7 @@ static std::shared_ptr MakeTypedColumnStats( descr, metadata.statistics.min_value, metadata.statistics.max_value, metadata.num_values - metadata.statistics.null_count, metadata.statistics.null_count, metadata.statistics.distinct_count, - metadata.statistics.__isset.max_value || metadata.statistics.__isset.min_value); + metadata.statistics.__isset.max_value || metadata.statistics.__isset.min_value); } // Default behavior return MakeStatistics( @@ -173,14 +174,12 @@ const std::string& ColumnCryptoMetaData::key_metadata() const { // ColumnChunk metadata class ColumnChunkMetaData::ColumnChunkMetaDataImpl { public: - explicit ColumnChunkMetaDataImpl( - const format::ColumnChunk* column, - const ColumnDescriptor* descr, - int16_t row_group_ordinal, - int16_t column_ordinal, - const ApplicationVersion* writer_version, - InternalFileDecryptor* file_decryptor = NULLPTR) - : column_(column), descr_(descr), writer_version_(writer_version) { + explicit ColumnChunkMetaDataImpl(const format::ColumnChunk* column, + const ColumnDescriptor* descr, + int16_t row_group_ordinal, int16_t column_ordinal, + const ApplicationVersion* writer_version, + InternalFileDecryptor* file_decryptor = NULLPTR) + : column_(column), descr_(descr), writer_version_(writer_version) { metadata_ = column->meta_data; if (column->__isset.crypto_metadata) { @@ -188,31 +187,26 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { if (ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY) { if (file_decryptor->properties() == NULLPTR) { - throw ParquetException("Cannot decrypt ColumnMetadata. 
" - "FileDecryptionProperties must be provided."); + throw ParquetException( + "Cannot decrypt ColumnMetadata. " + "FileDecryptionProperties must be provided."); } // should decrypt metadata - std::shared_ptr path = - std::make_shared( - ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema); + std::shared_ptr path = std::make_shared( + ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema); std::string key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata; DCHECK(file_decryptor != NULLPTR); std::string aad_column_metadata = parquet_encryption::createModuleAAD( - file_decryptor->file_aad(), - parquet_encryption::ColumnMetaData, - row_group_ordinal, - column_ordinal, (int16_t)-1); - auto decryptor = file_decryptor->GetColumnMetaDecryptor( - path, key_metadata, aad_column_metadata); + file_decryptor->file_aad(), parquet_encryption::ColumnMetaData, + row_group_ordinal, column_ordinal, (int16_t)-1); + auto decryptor = file_decryptor->GetColumnMetaDecryptor(path, key_metadata, + aad_column_metadata); uint32_t len = static_cast(column->encrypted_column_metadata.size()); DeserializeThriftMsg( - reinterpret_cast( - column->encrypted_column_metadata.c_str()), - &len, &metadata_, - decryptor, - false); + reinterpret_cast(column->encrypted_column_metadata.c_str()), + &len, &metadata_, decryptor, false); } } for (auto encoding : metadata_.encodings) { @@ -240,8 +234,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { DCHECK(writer_version_ != nullptr); // If the column statistics don't exist or column sort order is unknown // we cannot use the column stats - if (!metadata_.__isset.statistics || - descr_->sort_order() == SortOrder::UNKNOWN) { + if (!metadata_.__isset.statistics || descr_->sort_order() == SortOrder::UNKNOWN) { return false; } if (possible_stats_ == nullptr) { @@ -256,9 +249,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { return is_stats_set() ? 
possible_stats_ : nullptr; } - inline Compression::type compression() const { - return FromThrift(metadata_.codec); - } + inline Compression::type compression() const { return FromThrift(metadata_.codec); } const std::vector& encodings() const { return encodings_; } @@ -272,17 +263,11 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { inline int64_t data_page_offset() const { return metadata_.data_page_offset; } - inline bool has_index_page() const { - return metadata_.__isset.index_page_offset; - } + inline bool has_index_page() const { return metadata_.__isset.index_page_offset; } - inline int64_t index_page_offset() const { - return metadata_.index_page_offset; - } + inline int64_t index_page_offset() const { return metadata_.index_page_offset; } - inline int64_t total_compressed_size() const { - return metadata_.total_compressed_size; - } + inline int64_t total_compressed_size() const { return metadata_.total_compressed_size; } inline int64_t total_uncompressed_size() const { return metadata_.total_uncompressed_size; @@ -307,30 +292,23 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { }; std::unique_ptr ColumnChunkMetaData::Make( - const void* metadata, const ColumnDescriptor* descr, - int16_t row_group_ordinal, int16_t column_ordinal, - const ApplicationVersion* writer_version, + const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal, + int16_t column_ordinal, const ApplicationVersion* writer_version, InternalFileDecryptor* file_decryptor) { return std::unique_ptr( - new ColumnChunkMetaData(metadata, descr, row_group_ordinal, - column_ordinal, writer_version, - file_decryptor)); + new ColumnChunkMetaData(metadata, descr, row_group_ordinal, column_ordinal, + writer_version, file_decryptor)); } -ColumnChunkMetaData::ColumnChunkMetaData( - const void* metadata, - const ColumnDescriptor* descr, - int16_t row_group_ordinal, - int16_t column_ordinal, - const ApplicationVersion* writer_version, - InternalFileDecryptor* 
file_decryptor) - : impl_{std::unique_ptr(new ColumnChunkMetaDataImpl( - reinterpret_cast(metadata), - descr, - row_group_ordinal, - column_ordinal, - writer_version, - file_decryptor))} {} +ColumnChunkMetaData::ColumnChunkMetaData(const void* metadata, + const ColumnDescriptor* descr, + int16_t row_group_ordinal, + int16_t column_ordinal, + const ApplicationVersion* writer_version, + InternalFileDecryptor* file_decryptor) + : impl_{std::unique_ptr(new ColumnChunkMetaDataImpl( + reinterpret_cast(metadata), descr, + row_group_ordinal, column_ordinal, writer_version, file_decryptor))} {} ColumnChunkMetaData::~ColumnChunkMetaData() {} // column chunk int64_t ColumnChunkMetaData::file_offset() const { return impl_->file_offset(); } @@ -406,23 +384,23 @@ class RowGroupMetaData::RowGroupMetaDataImpl { inline int64_t file_offset() const { return row_group_->file_offset; } - inline int64_t total_compressed_size() const { return row_group_->total_compressed_size; } + inline int64_t total_compressed_size() const { + return row_group_->total_compressed_size; + } inline const SchemaDescriptor* schema() const { return schema_; } std::unique_ptr ColumnChunk( - int i, int16_t row_group_ordinal, - InternalFileDecryptor* file_decryptor = NULLPTR) { + int i, int16_t row_group_ordinal, InternalFileDecryptor* file_decryptor = NULLPTR) { if (!(i < num_columns())) { std::stringstream ss; ss << "The file only has " << num_columns() << " columns, requested metadata for column: " << i; throw ParquetException(ss.str()); } - return ColumnChunkMetaData::Make( - &row_group_->columns[i], schema_->Column(i), - row_group_ordinal, (int16_t)i, - writer_version_, file_decryptor); + return ColumnChunkMetaData::Make(&row_group_->columns[i], schema_->Column(i), + row_group_ordinal, (int16_t)i, writer_version_, + file_decryptor); } private: @@ -463,9 +441,8 @@ class FileMetaData::FileMetaDataImpl { public: FileMetaDataImpl() : metadata_len_(0) {} - explicit FileMetaDataImpl( - const void* metadata, 
uint32_t* metadata_len, - const std::shared_ptr& decryptor = nullptr) + explicit FileMetaDataImpl(const void* metadata, uint32_t* metadata_len, + const std::shared_ptr& decryptor = nullptr) : metadata_len_(0) { metadata_.reset(new format::FileMetaData); DeserializeThriftMsg(reinterpret_cast(metadata), metadata_len, @@ -492,16 +469,16 @@ class FileMetaData::FileMetaDataImpl { // encrypt with nonce uint8_t* nonce = const_cast(reinterpret_cast(tail)); - uint8_t* tag = const_cast(reinterpret_cast(tail)) - + parquet_encryption::NonceLength; - - std::vector encrypted_buffer(encryptor->CiphertextSizeDelta() + serialized_len); - uint32_t encrypted_len = - encryptor->SignedFooterEncrypt(serialized_data, serialized_len, - nonce, encrypted_buffer.data()); - return 0 == memcmp( - encrypted_buffer.data() + encrypted_len - parquet_encryption::GCMTagLength, - tag, parquet_encryption::GCMTagLength); + uint8_t* tag = const_cast(reinterpret_cast(tail)) + + parquet_encryption::NonceLength; + + std::vector encrypted_buffer(encryptor->CiphertextSizeDelta() + + serialized_len); + uint32_t encrypted_len = encryptor->SignedFooterEncrypt( + serialized_data, serialized_len, nonce, encrypted_buffer.data()); + return 0 == memcmp(encrypted_buffer.data() + encrypted_len - + parquet_encryption::GCMTagLength, + tag, parquet_encryption::GCMTagLength); } inline uint32_t size() const { return metadata_len_; } @@ -515,7 +492,9 @@ class FileMetaData::FileMetaDataImpl { inline int num_schema_elements() const { return static_cast(metadata_->schema.size()); } - inline bool is_encryption_algorithm_set() const { return metadata_->__isset.encryption_algorithm; } + inline bool is_encryption_algorithm_set() const { + return metadata_->__isset.encryption_algorithm; + } inline EncryptionAlgorithm encryption_algorithm() { return FromThrift(metadata_->encryption_algorithm); } @@ -534,9 +513,10 @@ class FileMetaData::FileMetaDataImpl { serializer.SerializeToBuffer(metadata_.get(), &serialized_len, 
&serialized_data); // encrypt the footer key - std::vector encrypted_data(encryptor->CiphertextSizeDelta() + serialized_len); - unsigned encrypted_len = encryptor->Encrypt(serialized_data, serialized_len, - encrypted_data.data()); + std::vector encrypted_data(encryptor->CiphertextSizeDelta() + + serialized_len); + unsigned encrypted_len = + encryptor->Encrypt(serialized_data, serialized_len, encrypted_data.data()); // write unencrypted footer dst->Write(serialized_data, serialized_len); @@ -544,9 +524,8 @@ class FileMetaData::FileMetaDataImpl { dst->Write(encrypted_data.data() + 4, parquet_encryption::NonceLength); // write tag dst->Write(encrypted_data.data() + encrypted_len - parquet_encryption::GCMTagLength, - parquet_encryption::GCMTagLength); - } - else { + parquet_encryption::GCMTagLength); + } else { serializer.Serialize(metadata_.get(), dst, encryptor, false); } } @@ -633,8 +612,7 @@ class FileMetaData::FileMetaDataImpl { }; std::shared_ptr FileMetaData::Make( - const void* metadata, - uint32_t* metadata_len, + const void* metadata, uint32_t* metadata_len, const std::shared_ptr& decryptor) { // This FileMetaData ctor is private, not compatible with std::make_shared return std::shared_ptr( @@ -655,7 +633,8 @@ std::unique_ptr FileMetaData::RowGroup(int i) const { return impl_->RowGroup(i); } -bool FileMetaData::verify(std::shared_ptr encryptor, const void* tail) { +bool FileMetaData::verify(std::shared_ptr encryptor, + const void* tail) { return impl_->verify(encryptor, tail); } @@ -667,7 +646,9 @@ int64_t FileMetaData::num_rows() const { return impl_->num_rows(); } int FileMetaData::num_row_groups() const { return impl_->num_row_groups(); } -bool FileMetaData::is_encryption_algorithm_set() const { return impl_->is_encryption_algorithm_set(); } +bool FileMetaData::is_encryption_algorithm_set() const { + return impl_->is_encryption_algorithm_set(); +} EncryptionAlgorithm FileMetaData::encryption_algorithm() const { return impl_->encryption_algorithm(); @@ 
-977,13 +958,16 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { // encrypt it with the column key, and write to encrypted_column_metadata uint8_t* serialized_data; uint32_t serialized_len; - serializer.SerializeToBuffer(&column_metadata_, &serialized_len, &serialized_data); + serializer.SerializeToBuffer(&column_metadata_, &serialized_len, + &serialized_data); - std::vector encrypted_data(encryptor->CiphertextSizeDelta() + serialized_len); - unsigned encrypted_len = encryptor->Encrypt( - serialized_data, serialized_len, encrypted_data.data()); + std::vector encrypted_data(encryptor->CiphertextSizeDelta() + + serialized_len); + unsigned encrypted_len = + encryptor->Encrypt(serialized_data, serialized_len, encrypted_data.data()); - const char* temp = const_cast(reinterpret_cast(encrypted_data.data())); + const char* temp = + const_cast(reinterpret_cast(encrypted_data.data())); std::string encrypted_column_metadata(temp, encrypted_len); column_chunk_->__set_encrypted_column_metadata(encrypted_column_metadata); // Keep redacted metadata version for old readers @@ -997,8 +981,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } column_chunk_->__isset.meta_data = true; column_chunk_->__set_meta_data(metadata_redacted); - } - else { + } else { // don't set meta_data column_chunk_->__isset.meta_data = true; } @@ -1206,7 +1189,8 @@ void RowGroupMetaDataBuilder::set_num_rows(int64_t num_rows) { impl_->set_num_rows(num_rows); } -void RowGroupMetaDataBuilder::Finish(int64_t total_bytes_written, int16_t row_group_ordinal) { +void RowGroupMetaDataBuilder::Finish(int64_t total_bytes_written, + int16_t row_group_ordinal) { impl_->Finish(total_bytes_written, row_group_ordinal); } @@ -1219,8 +1203,8 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { const std::shared_ptr& key_value_metadata) : properties_(props), schema_(schema), key_value_metadata_(key_value_metadata) { metadata_.reset(new format::FileMetaData()); - if 
(props->file_encryption() != nullptr - && props->file_encryption()->footer_signing_key() == NULL_STRING) { + if (props->file_encryption() != nullptr && + props->file_encryption()->footer_signing_key() == NULL_STRING) { crypto_metadata_.reset(new format::FileCryptoMetaData()); } } diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 1f12e2355f97..f7dc0e09cb41 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -19,16 +19,16 @@ #define PARQUET_FILE_METADATA_H #include +#include #include #include #include -#include #include "arrow/util/key_value_metadata.h" #include "parquet/platform.h" -#include "parquet/schema.h" #include "parquet/properties.h" +#include "parquet/schema.h" #include "parquet/types.h" namespace parquet { @@ -124,9 +124,8 @@ class PARQUET_EXPORT ColumnChunkMetaData { public: // API convenience to get a MetaData accessor static std::unique_ptr Make( - const void* metadata, const ColumnDescriptor* descr, - int16_t row_group_ordinal = -1, int16_t column_ordinal = -1, - const ApplicationVersion* writer_version = NULLPTR, + const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal = -1, + int16_t column_ordinal = -1, const ApplicationVersion* writer_version = NULLPTR, InternalFileDecryptor* file_decryptor = NULLPTR); ~ColumnChunkMetaData(); @@ -179,8 +178,9 @@ class PARQUET_EXPORT RowGroupMetaData { int64_t total_byte_size() const; // Return const-pointer to make it clear that this object is not to be copied const SchemaDescriptor* schema() const; - std::unique_ptr ColumnChunk(int i, int16_t row_group_ordinal = -1, - InternalFileDecryptor* file_decryptor = NULLPTR) const; + std::unique_ptr ColumnChunk( + int i, int16_t row_group_ordinal = -1, + InternalFileDecryptor* file_decryptor = NULLPTR) const; private: explicit RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema, @@ -195,14 +195,13 @@ class FileMetaDataBuilder; class PARQUET_EXPORT FileMetaData { public: // API 
convenience to get a MetaData accessor - static std::shared_ptr Make(const void* serialized_metadata, - uint32_t* metadata_len, - const std::shared_ptr& decryptor = NULLPTR); + static std::shared_ptr Make( + const void* serialized_metadata, uint32_t* metadata_len, + const std::shared_ptr& decryptor = NULLPTR); ~FileMetaData(); - bool verify(std::shared_ptr encryptor, - const void* tail); + bool verify(std::shared_ptr encryptor, const void* tail); // file metadata uint32_t size() const; @@ -221,7 +220,8 @@ class PARQUET_EXPORT FileMetaData { const ApplicationVersion& writer_version() const; - void WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryptor = NULLPTR) const; + void WriteTo(::arrow::io::OutputStream* dst, + const std::shared_ptr& encryptor = NULLPTR) const; // Return const-pointer to make it clear that this object is not to be copied const SchemaDescriptor* schema() const; @@ -301,6 +301,7 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { // For writing metadata at end of column chunk void WriteTo(::arrow::io::OutputStream* sink, const std::shared_ptr& encryptor = NULLPTR); + private: explicit ColumnChunkMetaDataBuilder(const std::shared_ptr& props, const ColumnDescriptor* column); @@ -351,8 +352,9 @@ class PARQUET_EXPORT FileMetaDataBuilder { RowGroupMetaDataBuilder* AppendRowGroup(); // Complete the Thrift structure - std::unique_ptr Finish(const EncryptionAlgorithm* signing_algorithm = NULLPTR, - const std::string& footer_signing_key_metadata = ""); + std::unique_ptr Finish( + const EncryptionAlgorithm* signing_algorithm = NULLPTR, + const std::string& footer_signing_key_metadata = ""); // crypto metadata std::unique_ptr GetCryptoMetaData(); diff --git a/cpp/src/parquet/parquet.pc b/cpp/src/parquet/parquet.pc new file mode 100644 index 000000000000..e46eea65b729 --- /dev/null +++ b/cpp/src/parquet/parquet.pc @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +prefix=/usr/local +libdir=${prefix}/lib +includedir=${prefix}/include + +so_version=13 +abi_version=13 +full_so_version=13.0.0 + +Name: Apache Parquet +Description: Apache Parquet is a columnar storage format. +Version: 1.5.1-SNAPSHOT +Libs: -L${libdir} -lparquet +Cflags: -I${includedir} diff --git a/cpp/src/parquet/parquet_version.h b/cpp/src/parquet/parquet_version.h new file mode 100644 index 000000000000..dd83e45028df --- /dev/null +++ b/cpp/src/parquet/parquet_version.h @@ -0,0 +1,24 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PARQUET_VERSION_H +#define PARQUET_VERSION_H + +// define the parquet created by version +#define CREATED_BY_VERSION "parquet-cpp version 1.5.1-SNAPSHOT" + +#endif // PARQUET_VERSION_H diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 5e5f6cdd2e03..712fde1ddb47 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -22,17 +22,16 @@ #include #include #include +#include #include "arrow/type.h" +#include "parquet/encryption_properties.h" #include "parquet/exception.h" #include "parquet/parquet_version.h" #include "parquet/platform.h" #include "parquet/schema.h" #include "parquet/types.h" -#include "arrow/util/logging.h" -#include "arrow/util/utf8.h" -#include "parquet/encryption_properties.h" namespace parquet { @@ -420,8 +419,8 @@ class PARQUET_EXPORT WriterProperties { return column_properties(path).max_statistics_size(); } - std::shared_ptr column_encryption_props(const - std::shared_ptr& path) const { + std::shared_ptr column_encryption_props( + const std::shared_ptr& path) const { if (file_encryption_) { return file_encryption_->column_properties(path); } else { diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 4a5a98fff8c9..1e2068c9646b 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -28,6 +28,7 @@ #include #endif #include +#include // TCompactProtocol requires some #defines to work right. 
#define SIGNED_RIGHT_SHIFT_IS 1 @@ -43,8 +44,8 @@ #include "arrow/util/logging.h" #include "parquet/exception.h" #include "parquet/platform.h" -#include "parquet/internal_file_encryptor.h" #include "parquet/internal_file_decryptor.h" +#include "parquet/internal_file_encryptor.h" #include "parquet/statistics.h" #include "parquet/util/crypto.h" @@ -85,19 +86,13 @@ static inline Compression::type FromThrift(format::CompressionCodec::type type) } static inline AadMetadata FromThrift(format::AesGcmV1 aesGcmV1) { - return AadMetadata { - aesGcmV1.aad_prefix, - aesGcmV1.aad_file_unique, - aesGcmV1.supply_aad_prefix - }; + return AadMetadata{aesGcmV1.aad_prefix, aesGcmV1.aad_file_unique, + aesGcmV1.supply_aad_prefix}; } static inline AadMetadata FromThrift(format::AesGcmCtrV1 aesGcmCtrV1) { - return AadMetadata { - aesGcmCtrV1.aad_prefix, - aesGcmCtrV1.aad_file_unique, - aesGcmCtrV1.supply_aad_prefix - }; + return AadMetadata{aesGcmCtrV1.aad_prefix, aesGcmCtrV1.aad_file_unique, + aesGcmCtrV1.supply_aad_prefix}; } static inline EncryptionAlgorithm FromThrift(format::EncryptionAlgorithm encryption) { @@ -228,22 +223,20 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali uint8_t clenBytes[4]; memcpy(clenBytes, buf, 4); clen = *(reinterpret_cast(clenBytes)); - } - else { + } else { clen = *len; } // decrypt const uint8_t* cipherBuf = shouldReadLength ? 
&buf[4] : buf; std::vector decrypted_buffer(clen - decryptor->CiphertextSizeDelta()); - uint32_t decrypted_buffer_len = decryptor->Decrypt( - cipherBuf, 0, decrypted_buffer.data()); + uint32_t decrypted_buffer_len = + decryptor->Decrypt(cipherBuf, 0, decrypted_buffer.data()); if (decrypted_buffer_len <= 0) { throw ParquetException("Couldn't decrypt buffer\n"); } *len = decrypted_buffer_len + decryptor->CiphertextSizeDelta(); DeserializeThriftMsg(decrypted_buffer.data(), &decrypted_buffer_len, deserialized_msg); - } } @@ -287,12 +280,13 @@ class ThriftSerializer { return static_cast(out_length); } else { std::vector cipher_buffer(encryptor->CiphertextSizeDelta() + out_length); - int cipher_buffer_len = encryptor->Encrypt(out_buffer, out_length, - cipher_buffer.data()); + int cipher_buffer_len = + encryptor->Encrypt(out_buffer, out_length, cipher_buffer.data()); if (cipher_buffer_len > static_cast(cipher_buffer.size())) { std::stringstream ss; - ss << "cipher length is greater than cipher buffer capacity: " << cipher_buffer_len << cipher_buffer.size() << "\n"; + ss << "cipher length is greater than cipher buffer capacity: " + << cipher_buffer_len << cipher_buffer.size() << "\n"; throw ParquetException(ss.str()); } From f7ea94dae72f928b6921b518c502bd4e09c1cc0a Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Sun, 12 May 2019 08:26:56 +0300 Subject: [PATCH 050/201] Add plaintext_files_allowed --- cpp/src/parquet/encryption_properties.cc | 4 +++- cpp/src/parquet/encryption_properties.h | 26 +++++++++++++++++++++--- cpp/src/parquet/file_reader.cc | 9 +++++++- 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/encryption_properties.cc b/cpp/src/parquet/encryption_properties.cc index f23858194700..cf34908a1c17 100644 --- a/cpp/src/parquet/encryption_properties.cc +++ b/cpp/src/parquet/encryption_properties.cc @@ -91,7 +91,8 @@ FileDecryptionProperties::FileDecryptionProperties( std::shared_ptr aad_prefix_verifier, const std::map, 
std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties) { + schema::ColumnPath::CmpColumnPath>& column_properties, + bool plaintext_files_allowed) { DCHECK(!footer_key.empty() || NULLPTR != key_retriever || 0 != column_properties.size()); @@ -108,6 +109,7 @@ FileDecryptionProperties::FileDecryptionProperties( key_retriever_ = key_retriever; aad_prefix_ = aad_prefix; column_properties_ = column_properties; + plaintext_files_allowed_ = plaintext_files_allowed; } FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key_id( diff --git a/cpp/src/parquet/encryption_properties.h b/cpp/src/parquet/encryption_properties.h index 5848ad3fa685..b40acb341673 100644 --- a/cpp/src/parquet/encryption_properties.h +++ b/cpp/src/parquet/encryption_properties.h @@ -38,6 +38,7 @@ static constexpr ParquetCipher::type DEFAULT_ENCRYPTION_ALGORITHM = static constexpr int32_t MAXIMAL_AAD_METADATA_LENGTH = 256; static constexpr bool DEFAULT_ENCRYPTED_FOOTER = true; static constexpr bool DEFAULT_CHECK_SIGNATURE = true; +static constexpr bool DEFAULT_ALLOW_PLAINTEXT_FILES = false; static constexpr int32_t AAD_FILE_UNIQUE_LENGTH = 8; class PARQUET_EXPORT ColumnEncryptionProperties { @@ -180,7 +181,10 @@ class PARQUET_EXPORT FileDecryptionProperties { public: class Builder { public: - Builder() { check_plaintext_footer_integrity_ = DEFAULT_CHECK_SIGNATURE; } + Builder() { + check_plaintext_footer_integrity_ = DEFAULT_CHECK_SIGNATURE; + plaintext_files_allowed_ = DEFAULT_ALLOW_PLAINTEXT_FILES; + } // Set an explicit footer key. If applied on a file that contains // footer key metadata the metadata will be ignored, the footer @@ -263,10 +267,21 @@ class PARQUET_EXPORT FileDecryptionProperties { return this; } + // By default, reading plaintext (unencrypted) files is not + // allowed when using a decryptor + // - in order to detect files that were not encrypted by mistake. + // However, the default behavior can be overriden by calling this method. 
+ // The caller should use then a different method to ensure encryption + // of files with sensitive data. + Builder* plaintext_files_allowed() { + plaintext_files_allowed_ = true; + return this; + } + std::shared_ptr build() { return std::shared_ptr(new FileDecryptionProperties( footer_key_, key_retriever_, check_plaintext_footer_integrity_, aad_prefix_, - aad_prefix_verifier_, column_properties_)); + aad_prefix_verifier_, column_properties_, plaintext_files_allowed_)); } private: @@ -281,6 +296,7 @@ class PARQUET_EXPORT FileDecryptionProperties { std::shared_ptr key_retriever_; bool check_plaintext_footer_integrity_; + bool plaintext_files_allowed_; }; const std::string& column_key(const std::shared_ptr& column_path); @@ -292,6 +308,8 @@ class PARQUET_EXPORT FileDecryptionProperties { bool check_plaintext_footer_integrity() { return check_plaintext_footer_integrity_; } + bool plaintext_files_allowed() { return plaintext_files_allowed_; } + const std::shared_ptr& aad_prefix_verifier() { return aad_prefix_verifier_; } @@ -307,6 +325,7 @@ class PARQUET_EXPORT FileDecryptionProperties { std::shared_ptr key_retriever_; bool check_plaintext_footer_integrity_; + bool plaintext_files_allowed_; FileDecryptionProperties( const std::string& footer_key, @@ -315,7 +334,8 @@ class PARQUET_EXPORT FileDecryptionProperties { std::shared_ptr aad_prefix_verifier, const std::map, std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties); + schema::ColumnPath::CmpColumnPath>& column_properties, + bool plaintext_files_allowed); }; class PARQUET_EXPORT FileEncryptionProperties { diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 6ef69a7de6e5..9c1f57e5cfc1 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -262,7 +262,14 @@ class SerializedFile : public ParquetFileReader::Contents { uint32_t read_metadata_len = metadata_len; file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &read_metadata_len); - 
if (file_metadata_->is_encryption_algorithm_set()) { + if (!file_metadata_->is_encryption_algorithm_set()) { // Plaintext file + auto file_decryption_properties = properties_.file_decryption_properties(); + if (file_decryption_properties != NULLPTR) { + if (!file_decryption_properties->plaintext_files_allowed()) { + throw ParquetException("Applying decryption properties on plaintext file"); + } + } + } else { auto file_decryption_properties = properties_.file_decryption_properties(); if (file_decryption_properties == NULLPTR) { throw ParquetException("No decryption properties are provided"); From f8d03dcd2a896e47ea33c7acbebe0f0860486f24 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Sun, 12 May 2019 08:47:25 +0300 Subject: [PATCH 051/201] Remove file_crypto_metadata_ field from SerializedRowGroup and SerializedFile classes --- cpp/src/parquet/file_reader.cc | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 9c1f57e5cfc1..096a5ef652d3 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -85,12 +85,10 @@ const RowGroupMetaData* RowGroupReader::metadata() const { return contents_->met class SerializedRowGroup : public RowGroupReader::Contents { public: SerializedRowGroup(const std::shared_ptr& source, - FileMetaData* file_metadata, - FileCryptoMetaData* file_crypto_metadata, int row_group_number, + FileMetaData* file_metadata, int row_group_number, const ReaderProperties& props, InternalFileDecryptor* file_decryptor) : source_(source), file_metadata_(file_metadata), - file_crypto_metadata_(file_crypto_metadata), properties_(props), row_group_ordinal_((int16_t)row_group_number), file_decryptor_(file_decryptor) { @@ -174,7 +172,6 @@ class SerializedRowGroup : public RowGroupReader::Contents { private: std::shared_ptr source_; FileMetaData* file_metadata_; - FileCryptoMetaData* file_crypto_metadata_; std::unique_ptr row_group_metadata_; 
ReaderProperties properties_; int16_t row_group_ordinal_; @@ -197,7 +194,7 @@ class SerializedFile : public ParquetFileReader::Contents { std::shared_ptr GetRowGroup(int i) override { std::unique_ptr contents(new SerializedRowGroup( - source_, file_metadata_.get(), file_crypto_metadata_.get(), i, properties_, + source_, file_metadata_.get(), i, properties_, file_decryptor_.get())); return std::make_shared(std::move(contents)); } @@ -353,7 +350,7 @@ class SerializedFile : public ParquetFileReader::Contents { } file_decryptor_.reset(new InternalFileDecryptor(file_decryption_properties)); uint32_t crypto_metadata_len = footer_len; - file_crypto_metadata_ = + std::shared_ptr file_crypto_metadata = FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); EncryptionAlgorithm algo = file_crypto_metadata_->encryption_algorithm(); @@ -382,7 +379,7 @@ class SerializedFile : public ParquetFileReader::Contents { // save fileAAD for later use file_decryptor_->file_aad(fileAAD); file_decryptor_->algorithm(algo.algorithm); - file_decryptor_->footer_key_metadata(file_crypto_metadata_->key_metadata()); + file_decryptor_->footer_key_metadata(file_crypto_metadata->key_metadata()); int64_t metadata_offset = file_size - kFooterSize - footer_len + crypto_metadata_len; @@ -405,7 +402,6 @@ class SerializedFile : public ParquetFileReader::Contents { private: std::shared_ptr source_; std::shared_ptr file_metadata_; - std::shared_ptr file_crypto_metadata_; ReaderProperties properties_; std::unique_ptr file_decryptor_; }; From 58381857d5e4f923e5707f8feea02ceb99f0e6b1 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Sun, 12 May 2019 08:58:56 +0300 Subject: [PATCH 052/201] Pass file_aad, algorithm and key_metadata to InternalFileDecryptor constructor --- cpp/src/parquet/file_reader.cc | 22 ++++++++++------------ cpp/src/parquet/internal_file_decryptor.cc | 8 ++++++-- cpp/src/parquet/internal_file_decryptor.h | 10 ++++------ 3 files changed, 20 insertions(+), 20 
deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 096a5ef652d3..7bbcf396c56a 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -259,19 +259,17 @@ class SerializedFile : public ParquetFileReader::Contents { uint32_t read_metadata_len = metadata_len; file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &read_metadata_len); + auto file_decryption_properties = properties_.file_decryption_properties(); if (!file_metadata_->is_encryption_algorithm_set()) { // Plaintext file - auto file_decryption_properties = properties_.file_decryption_properties(); if (file_decryption_properties != NULLPTR) { if (!file_decryption_properties->plaintext_files_allowed()) { throw ParquetException("Applying decryption properties on plaintext file"); } } } else { - auto file_decryption_properties = properties_.file_decryption_properties(); if (file_decryption_properties == NULLPTR) { throw ParquetException("No decryption properties are provided"); } - file_decryptor_.reset(new InternalFileDecryptor(file_decryption_properties)); std::string aad_prefix = file_decryption_properties->aad_prefix(); @@ -297,10 +295,10 @@ class SerializedFile : public ParquetFileReader::Contents { } std::string file_aad = aad_prefix + algo.aad.aad_file_unique; - file_decryptor_->file_aad(file_aad); - file_decryptor_->algorithm(algo.algorithm); - file_decryptor_->footer_key_metadata( - file_metadata_->footer_signing_key_metadata()); + file_decryptor_.reset(new InternalFileDecryptor(file_decryption_properties, + file_aad, algo.algorithm, + file_metadata_->footer_signing_key_metadata())); + if (file_decryption_properties->check_plaintext_footer_integrity()) { if (metadata_len - read_metadata_len != 28) { throw ParquetException( @@ -375,11 +373,11 @@ class SerializedFile : public ParquetFileReader::Contents { "but not stored in file and not supplied " "in decryption properties"); } - std::string fileAAD = aad_prefix + 
algo.aad.aad_file_unique; - // save fileAAD for later use - file_decryptor_->file_aad(fileAAD); - file_decryptor_->algorithm(algo.algorithm); - file_decryptor_->footer_key_metadata(file_crypto_metadata->key_metadata()); + std::string file_aad = aad_prefix + algo.aad.aad_file_unique; + file_decryptor_.reset(new InternalFileDecryptor(file_decryption_properties, + file_aad, algo.algorithm, + file_crypto_metadata->key_metadata())); + int64_t metadata_offset = file_size - kFooterSize - footer_len + crypto_metadata_len; diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index 1935093bbd97..5b6faa5c27b7 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -66,8 +66,12 @@ int Decryptor::Decrypt(const uint8_t* ciphertext, int ciphertext_len, } // InternalFileDecryptor -InternalFileDecryptor::InternalFileDecryptor(FileDecryptionProperties* properties) - : properties_(properties) {} + InternalFileDecryptor::InternalFileDecryptor(FileDecryptionProperties* properties, + const std::string& file_aad, + ParquetCipher::type algorithm, + const std::string& footer_key_metadata) + : properties_(properties), file_add_(file_aad), + algorithm_(algorithm), footer_key_metadata_(footer_key_metadata) {} std::shared_ptr InternalFileDecryptor::GetFooterSigningEncryptor() { diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h index 987c8ec77b9b..320c3546ed4e 100644 --- a/cpp/src/parquet/internal_file_decryptor.h +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -70,17 +70,15 @@ class Decryptor { class InternalFileDecryptor { public: - explicit InternalFileDecryptor(FileDecryptionProperties* properties); + explicit InternalFileDecryptor(FileDecryptionProperties* properties, + const std::string& file_aad, + ParquetCipher::type algorithm, + const std::string& footer_key_metadata); - void file_aad(const std::string& file_aad) { file_aad_ = file_aad; 
} std::string& file_aad() { return file_aad_; } - void algorithm(ParquetCipher::type algorithm) { algorithm_ = algorithm; } ParquetCipher::type algorithm() { return algorithm_; } - void footer_key_metadata(const std::string& footer_key_metadata) { - footer_key_metadata_ = footer_key_metadata; - } std::string& footer_key_metadata() { return footer_key_metadata_; } std::shared_ptr GetFooterSigningEncryptor(); From 444a95e9fab04c9ec42d0a9c61a866287f398918 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Sun, 12 May 2019 15:12:07 +0300 Subject: [PATCH 053/201] Fixes to previous commits --- cpp/src/parquet/file_reader.cc | 3 +-- cpp/src/parquet/internal_file_decryptor.cc | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 7bbcf396c56a..b92b1c91d57e 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -346,11 +346,10 @@ class SerializedFile : public ParquetFileReader::Contents { "No decryption properties are provided. 
Could not read " "encrypted footer metadata"); } - file_decryptor_.reset(new InternalFileDecryptor(file_decryption_properties)); uint32_t crypto_metadata_len = footer_len; std::shared_ptr file_crypto_metadata = FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); - EncryptionAlgorithm algo = file_crypto_metadata_->encryption_algorithm(); + EncryptionAlgorithm algo = file_crypto_metadata->encryption_algorithm(); std::string aad_prefix = file_decryption_properties->aad_prefix(); diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index 5b6faa5c27b7..7514d37d9245 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -70,7 +70,7 @@ int Decryptor::Decrypt(const uint8_t* ciphertext, int ciphertext_len, const std::string& file_aad, ParquetCipher::type algorithm, const std::string& footer_key_metadata) - : properties_(properties), file_add_(file_aad), + : properties_(properties), file_aad_(file_aad), algorithm_(algorithm), footer_key_metadata_(footer_key_metadata) {} std::shared_ptr From b894de3abb8738816699114650520eefa64cade3 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Sun, 12 May 2019 15:30:19 +0300 Subject: [PATCH 054/201] Put encryption_properties.h/cc content in encryption.h/cc --- cpp/src/parquet/encryption.cc | 159 +++++++ cpp/src/parquet/encryption.h | 454 ++++++++++++++++++- cpp/src/parquet/encryption_properties.cc | 182 -------- cpp/src/parquet/encryption_properties.h | 481 --------------------- cpp/src/parquet/internal_file_decryptor.cc | 2 +- cpp/src/parquet/internal_file_encryptor.cc | 2 +- cpp/src/parquet/properties.h | 2 +- 7 files changed, 615 insertions(+), 667 deletions(-) delete mode 100644 cpp/src/parquet/encryption_properties.cc delete mode 100644 cpp/src/parquet/encryption_properties.h diff --git a/cpp/src/parquet/encryption.cc b/cpp/src/parquet/encryption.cc index cc8501e88915..f7c4ab30cebe 100644 --- 
a/cpp/src/parquet/encryption.cc +++ b/cpp/src/parquet/encryption.cc @@ -19,6 +19,10 @@ #include #include +#include +#include + +#include "arrow/util/utf8.h" namespace parquet { @@ -43,4 +47,159 @@ const std::string& StringKeyIdRetriever::GetKey(const std::string& key_id) { return key_map_[key_id]; } +ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key_id( + const std::string& key_id) { + // key_id is expected to be in UTF8 encoding + ::arrow::util::InitializeUTF8(); + const uint8_t* data = reinterpret_cast(key_id.c_str()); + if (!::arrow::util::ValidateUTF8(data, key_id.size())) { + throw ParquetException("key id should be in UTF8 encoding"); + } + + DCHECK(!key_id.empty()); + this->key_metadata(key_id); + return this; +} + +ColumnEncryptionProperties::ColumnEncryptionProperties( + bool encrypted, const std::shared_ptr& column_path, + const std::string& key, const std::string& key_metadata) + : column_path_(column_path) { + DCHECK(column_path != nullptr); + if (!encrypted) { + DCHECK(key.empty() && key_metadata.empty()); + } + + if (!key.empty()) { + DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); + } + + encrypted_with_footer_key_ = (encrypted && key.empty()); + if (encrypted_with_footer_key_) { + DCHECK(key_metadata.empty()); + } + + encrypted_ = encrypted; + key_metadata_ = key_metadata; + key_ = key; +} + +ColumnDecryptionProperties::ColumnDecryptionProperties( + const std::shared_ptr& column_path, const std::string& key) + : column_path_(column_path) { + DCHECK(column_path != nullptr); + + if (!key.empty()) { + DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); + } + + key_ = key; +} + +const std::string& FileDecryptionProperties::column_key( + const std::shared_ptr& column_path) { + if (column_properties_.find(column_path) != column_properties_.end()) { + auto column_prop = column_properties_[column_path]; + if (column_prop != nullptr) { + return column_prop->key(); + } + } + return 
NULL_STRING; +} + +FileDecryptionProperties::FileDecryptionProperties( + const std::string& footer_key, + const std::shared_ptr& key_retriever, + bool check_plaintext_footer_integrity, const std::string& aad_prefix, + std::shared_ptr aad_prefix_verifier, + const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& column_properties, + bool plaintext_files_allowed) { + DCHECK(!footer_key.empty() || NULLPTR != key_retriever || + 0 != column_properties.size()); + + if (!footer_key.empty()) { + DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || + footer_key.length() == 32); + } + if (footer_key.empty() && check_plaintext_footer_integrity) { + DCHECK(NULLPTR != key_retriever); + } + aad_prefix_verifier_ = aad_prefix_verifier; + footer_key_ = footer_key; + check_plaintext_footer_integrity_ = check_plaintext_footer_integrity; + key_retriever_ = key_retriever; + aad_prefix_ = aad_prefix; + column_properties_ = column_properties; + plaintext_files_allowed_ = plaintext_files_allowed; +} + +FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key_id( + const std::string& key_id) { + // key_id is expected to be in UTF8 encoding + ::arrow::util::InitializeUTF8(); + const uint8_t* data = reinterpret_cast(key_id.c_str()); + if (!::arrow::util::ValidateUTF8(data, key_id.size())) { + throw ParquetException("footer key id should be in UTF8 encoding"); + } + + if (key_id.empty()) { + return this; + } + + return footer_key_metadata(key_id); +} + +std::shared_ptr FileEncryptionProperties::column_properties( + const std::shared_ptr& column_path) { + if (column_properties_.size() == 0) { + auto builder = std::shared_ptr( + new ColumnEncryptionProperties::Builder(column_path)); + return builder->build(); + } + if (column_properties_.find(column_path) != column_properties_.end()) { + return column_properties_[column_path]; + } + + return NULLPTR; +} + +FileEncryptionProperties::FileEncryptionProperties( + ParquetCipher::type cipher, const 
std::string& footer_key, + const std::string& footer_key_metadata, bool encrypted_footer, + const std::string& aad_prefix, bool store_aad_prefix_in_file, + const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& column_properties) + : footer_key_(footer_key), + footer_key_metadata_(footer_key_metadata), + encrypted_footer_(encrypted_footer), + column_properties_(column_properties) { + DCHECK(!footer_key.empty()); + // footer_key must be either 16, 24 or 32 bytes. + DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || + footer_key.length() == 32); + + uint8_t aad_file_unique[AAD_FILE_UNIQUE_LENGTH]; + memset(aad_file_unique, 0, AAD_FILE_UNIQUE_LENGTH); + RAND_bytes(aad_file_unique, sizeof(AAD_FILE_UNIQUE_LENGTH)); + std::string aad_file_unique_str(reinterpret_cast(aad_file_unique), + AAD_FILE_UNIQUE_LENGTH); + + bool supply_aad_prefix = false; + if (aad_prefix.empty()) { + file_aad_ = aad_file_unique_str; + } else { + file_aad_ = aad_prefix + aad_file_unique_str; + if (!store_aad_prefix_in_file) supply_aad_prefix = true; + } + algorithm_.algorithm = cipher; + algorithm_.aad.aad_file_unique = aad_file_unique_str; + algorithm_.aad.supply_aad_prefix = supply_aad_prefix; + if (!aad_prefix.empty() && store_aad_prefix_in_file) { + algorithm_.aad.aad_prefix = aad_prefix; + } +} + } // namespace parquet diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index aff37839c8c4..a54c4bb26ab6 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -20,12 +20,28 @@ #include #include +#include +#include -#include +#include "arrow/util/logging.h" +#include "parquet/encryption.h" #include "parquet/exception.h" +#include "parquet/schema.h" +#include "parquet/types.h" +#include "parquet/util/visibility.h" + namespace parquet { +static const std::string NULL_STRING = ""; +static constexpr ParquetCipher::type DEFAULT_ENCRYPTION_ALGORITHM = + ParquetCipher::AES_GCM_V1; +static constexpr int32_t 
MAXIMAL_AAD_METADATA_LENGTH = 256; +static constexpr bool DEFAULT_ENCRYPTED_FOOTER = true; +static constexpr bool DEFAULT_CHECK_SIGNATURE = true; +static constexpr bool DEFAULT_ALLOW_PLAINTEXT_FILES = false; +static constexpr int32_t AAD_FILE_UNIQUE_LENGTH = 8; + class PARQUET_EXPORT DecryptionKeyRetriever { public: virtual const std::string& GetKey(const std::string& key_metadata) = 0; @@ -70,6 +86,442 @@ class PARQUET_EXPORT UnsupportedOperationException : public ParquetException { : ParquetException(columnPath.c_str()) {} }; +class PARQUET_EXPORT ColumnEncryptionProperties { + public: + class Builder { + public: + // Convenience builder for regular (not nested) columns. + explicit Builder(const std::string& name) { + Builder(schema::ColumnPath::FromDotString(name), true); + } + + // Convenience builder for encrypted columns. + explicit Builder(const std::shared_ptr& path) + : Builder(path, true) {} + + // Set a column-specific key. + // If key is not set on an encrypted column, the column will + // be encrypted with the footer key. + // keyBytes Key length must be either 16, 24 or 32 bytes. + Builder* key(const std::string& key) { + if (key.empty()) return this; + + DCHECK(!key.empty()); + key_ = key; + return this; + } + + // Set a key retrieval metadata. + // use either key_metadata() or key_id(), not both + Builder* key_metadata(const std::string& key_metadata) { + DCHECK(!key_metadata.empty()); + DCHECK(key_metadata_.empty()); + key_metadata_ = key_metadata; + return this; + } + + // Set a key retrieval metadata (converted from String). + // use either key_metadata() or key_id(), not both + // key_id will be converted to metadata (UTF-8 array). 
+ Builder* key_id(const std::string& key_id); + + std::shared_ptr build() { + return std::shared_ptr( + new ColumnEncryptionProperties(encrypted_, column_path_, key_, key_metadata_)); + } + + private: + const std::shared_ptr column_path_; + bool encrypted_; + std::string key_; + std::string key_metadata_; + + Builder(const std::shared_ptr& path, bool encrypted) + : column_path_(path), encrypted_(encrypted) {} + }; + + const std::shared_ptr& column_path() { return column_path_; } + bool is_encrypted() const { return encrypted_; } + bool is_encrypted_with_footer_key() const { return encrypted_with_footer_key_; } + const std::string& key() const { return key_; } + const std::string& key_metadata() const { return key_metadata_; } + + ColumnEncryptionProperties() = default; + ColumnEncryptionProperties(const ColumnEncryptionProperties& other) = default; + ColumnEncryptionProperties(ColumnEncryptionProperties&& other) = default; + + private: + const std::shared_ptr column_path_; + bool encrypted_; + bool encrypted_with_footer_key_; + std::string key_; + std::string key_metadata_; + explicit ColumnEncryptionProperties( + bool encrypted, const std::shared_ptr& column_path, + const std::string& key, const std::string& key_metadata); +}; + +class PARQUET_EXPORT ColumnDecryptionProperties { + public: + class Builder { + public: + // convenience builder for regular (not nested) columns. + explicit Builder(const std::string& name) + : Builder(schema::ColumnPath::FromDotString(name)) {} + + explicit Builder(const std::shared_ptr& path) + : column_path_(path) {} + + // Set an explicit column key. If applied on a file that contains + // key metadata for this column the metadata will be ignored, + // the column will be decrypted with this key. + // key length must be either 16, 24 or 32 bytes. 
+ Builder* key(const std::string& key) { + if (key.empty()) return this; + + DCHECK(!key.empty()); + key_ = key; + return this; + } + + std::shared_ptr build() { + return std::shared_ptr( + new ColumnDecryptionProperties(column_path_, key_)); + } + + private: + const std::shared_ptr column_path_; + std::string key_; + }; + + ColumnDecryptionProperties() = default; + ColumnDecryptionProperties(const ColumnDecryptionProperties& other) = default; + ColumnDecryptionProperties(ColumnDecryptionProperties&& other) = default; + + const std::shared_ptr& column_path() { return column_path_; } + const std::string& key() const { return key_; } + + private: + const std::shared_ptr column_path_; + std::string key_; + + // This class is only required for setting explicit column decryption keys - + // to override key retriever (or to provide keys when key metadata and/or + // key retriever are not available) + explicit ColumnDecryptionProperties( + const std::shared_ptr& column_path, const std::string& key); +}; + +class PARQUET_EXPORT AADPrefixVerifier { + public: + // Verifies identity (AAD Prefix) of individual file, + // or of file collection in a data set. + // Throws exception if an AAD prefix is wrong. + // In a data set, AAD Prefixes should be collected, + // and then checked for missing files. + virtual void check(const std::string& aad_prefix) = 0; +}; + +class PARQUET_EXPORT FileDecryptionProperties { + public: + class Builder { + public: + Builder() { + check_plaintext_footer_integrity_ = DEFAULT_CHECK_SIGNATURE; + plaintext_files_allowed_ = DEFAULT_ALLOW_PLAINTEXT_FILES; + } + + // Set an explicit footer key. If applied on a file that contains + // footer key metadata the metadata will be ignored, the footer + // will be decrypted/verified with this key. + // If explicit key is not set, footer key will be fetched from + // key retriever. + // param footerKey Key length must be either 16, 24 or 32 bytes. 
+ Builder* footer_key(const std::string& footer_key) { + if (footer_key.empty()) { + return this; + } + DCHECK(!footer_key.empty()); + footer_key_ = footer_key; + return this; + } + + // Set explicit column keys (decryption properties). + // Its also possible to set a key retriever on this property object. + // Upon file decryption, availability of explicit keys is checked before + // invocation of the retriever callback. + // If an explicit key is available for a footer or a column, + // its key metadata will be ignored. + Builder* column_properties( + const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& column_properties) { + if (column_properties.size() == 0) return this; + + if (column_properties_.size() != 0) + throw ParquetException("Column properties already set"); + + column_properties_ = column_properties; + return this; + } + + // Set a key retriever callback. Its also possible to + // set explicit footer or column keys on this file property object. + // Upon file decryption, availability of explicit keys is checked before + // invocation of the retriever callback. + // If an explicit key is available for a footer or a column, + // its key metadata will be ignored. + Builder* key_retriever(const std::shared_ptr& key_retriever) { + if (key_retriever == NULLPTR) return this; + + DCHECK(key_retriever_ == NULLPTR); + key_retriever_ = key_retriever; + return this; + } + + // Skip integrity verification of plaintext footers. + // If not called, integrity of plaintext footers will be checked in runtime, + // and an exception will be thrown in the following situations: + // - footer signing key is not available + // (not passed, or not found by key retriever) + // - footer content and signature don't match + Builder* disable_footer_signature_verification() { + check_plaintext_footer_integrity_ = false; + return this; + } + + // Explicitly supply the file AAD prefix. + // A must when a prefix is used for file encryption, but not stored in file. 
+ // If AAD prefix is stored in file, it will be compared to the explicitly + // supplied value and an exception will be thrown if they differ. + Builder* aad_prefix(const std::string& aad_prefix) { + if (aad_prefix.empty()) { + return this; + } + DCHECK(aad_prefix_.empty()); + aad_prefix_ = aad_prefix; + return this; + } + + // Set callback for verification of AAD Prefixes stored in file. + Builder* aad_prefix_verifier(std::shared_ptr aad_prefix_verifier) { + if (aad_prefix_verifier == NULLPTR) return this; + + DCHECK(aad_prefix_verifier_ == NULLPTR); + aad_prefix_verifier_ = aad_prefix_verifier; + return this; + } + + // By default, reading plaintext (unencrypted) files is not + // allowed when using a decryptor + // - in order to detect files that were not encrypted by mistake. + // However, the default behavior can be overriden by calling this method. + // The caller should use then a different method to ensure encryption + // of files with sensitive data. + Builder* plaintext_files_allowed() { + plaintext_files_allowed_ = true; + return this; + } + + std::shared_ptr build() { + return std::shared_ptr(new FileDecryptionProperties( + footer_key_, key_retriever_, check_plaintext_footer_integrity_, aad_prefix_, + aad_prefix_verifier_, column_properties_, plaintext_files_allowed_)); + } + + private: + std::string footer_key_; + std::string aad_prefix_; + std::shared_ptr aad_prefix_verifier_; + + std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath> + column_properties_; + + std::shared_ptr key_retriever_; + bool check_plaintext_footer_integrity_; + bool plaintext_files_allowed_; + }; + + const std::string& column_key(const std::shared_ptr& column_path); + + const std::string& footer_key() { return footer_key_; } + + const std::string& aad_prefix() { return aad_prefix_; } + std::shared_ptr key_retriever() { return key_retriever_; } + + bool check_plaintext_footer_integrity() { return check_plaintext_footer_integrity_; } + + bool plaintext_files_allowed() 
{ return plaintext_files_allowed_; } + + const std::shared_ptr& aad_prefix_verifier() { + return aad_prefix_verifier_; + } + + private: + std::string footer_key_; + std::string aad_prefix_; + std::shared_ptr aad_prefix_verifier_; + + std::map, + std::shared_ptr, schema::ColumnPath::CmpColumnPath> + column_properties_; + + std::shared_ptr key_retriever_; + bool check_plaintext_footer_integrity_; + bool plaintext_files_allowed_; + + FileDecryptionProperties( + const std::string& footer_key, + const std::shared_ptr& key_retriever, + bool check_plaintext_footer_integrity, const std::string& aad_prefix, + std::shared_ptr aad_prefix_verifier, + const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& column_properties, + bool plaintext_files_allowed); +}; + +class PARQUET_EXPORT FileEncryptionProperties { + public: + class Builder { + public: + explicit Builder(const std::string& footer_key) + : parquet_cipher_(DEFAULT_ENCRYPTION_ALGORITHM), + encrypted_footer_(DEFAULT_ENCRYPTED_FOOTER) { + footer_key_ = footer_key; + store_aad_prefix_in_file_ = false; + } + + // Create files with plaintext footer. + // If not called, the files will be created with encrypted footer (default). + Builder* enable_plaintext_footer() { + encrypted_footer_ = false; + return this; + } + + // Set encryption algorithm. + // If not called, files will be encrypted with AES_GCM_V1 (default). + Builder* algorithm(ParquetCipher::type parquet_cipher) { + parquet_cipher_ = parquet_cipher; + return this; + } + + // Set a key retrieval metadata (converted from String). + // use either footer_key_metadata or footer_key_id, not both. + Builder* footer_key_id(const std::string& key_id); + + // Set a key retrieval metadata. + // use either footer_key_metadata or footer_key_id, not both. 
+ Builder* footer_key_metadata(const std::string& footer_key_metadata) { + if (footer_key_metadata.empty()) return this; + + DCHECK(footer_key_metadata_.empty()); + footer_key_metadata_ = footer_key_metadata; + return this; + } + + // Set the file AAD Prefix. + Builder* aad_prefix(const std::string& aad_prefix) { + if (aad_prefix.empty()) return this; + + DCHECK(aad_prefix_.empty()); + aad_prefix_ = aad_prefix; + store_aad_prefix_in_file_ = true; + return this; + } + + // Skip storing AAD Prefix in file. + // If not called, and if AAD Prefix is set, it will be stored. + Builder* disable_store_aad_prefix_storage() { + DCHECK(!aad_prefix_.empty()); + + store_aad_prefix_in_file_ = false; + return this; + } + + // Set the list of encrypted columns and their properties (keys etc). + // If not called, all columns will be encrypted with the footer key. + // If called, the file columns not in the list will be left unencrypted. + Builder* column_properties( + const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& column_properties) { + if (column_properties.size() == 0) return this; + + if (column_properties_.size() != 0) + throw ParquetException("Column properties already set"); + + column_properties_ = column_properties; + return this; + } + + std::shared_ptr build() { + return std::shared_ptr(new FileEncryptionProperties( + parquet_cipher_, footer_key_, footer_key_metadata_, encrypted_footer_, + aad_prefix_, store_aad_prefix_in_file_, column_properties_)); + } + + private: + ParquetCipher::type parquet_cipher_; + bool encrypted_footer_; + std::string footer_key_; + std::string footer_key_metadata_; + + std::string aad_prefix_; + bool store_aad_prefix_in_file_; + std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath> + column_properties_; + }; + bool encrypted_footer() const { return encrypted_footer_; } + + const EncryptionAlgorithm algorithm() { return algorithm_; } + + const std::string& footer_encryption_key() { + return (encrypted_footer_ 
? footer_key_ : NULL_STRING); + } + + const std::string& footer_encryption_key_metadata() { + return (encrypted_footer_ ? footer_key_metadata_ : NULL_STRING); + } + + const std::string& footer_signing_key() { + return (encrypted_footer_ ? NULL_STRING : footer_key_); + } + + const std::string& footer_signing_key_metadata() { + return (encrypted_footer_ ? NULL_STRING : footer_key_metadata_); + } + + const std::string& file_aad() const { return file_aad_; } + + std::shared_ptr column_properties( + const std::shared_ptr& column_path); + + private: + EncryptionAlgorithm algorithm_; + std::string footer_key_; + std::string footer_key_metadata_; + bool encrypted_footer_; + std::string file_aad_; + + std::map, + std::shared_ptr, schema::ColumnPath::CmpColumnPath> + column_properties_; + + FileEncryptionProperties( + ParquetCipher::type cipher, const std::string& footer_key, + const std::string& footer_key_metadata, bool encrypted_footer, + const std::string& aad_prefix, bool store_aad_prefix_in_file, + const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& column_properties); +}; + + } // namespace parquet #endif // PARQUET_ENCRYPTION_H diff --git a/cpp/src/parquet/encryption_properties.cc b/cpp/src/parquet/encryption_properties.cc deleted file mode 100644 index cf34908a1c17..000000000000 --- a/cpp/src/parquet/encryption_properties.cc +++ /dev/null @@ -1,182 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "parquet/encryption_properties.h" - -#include -#include - -#include "arrow/util/utf8.h" - -namespace parquet { - -ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key_id( - const std::string& key_id) { - // key_id is expected to be in UTF8 encoding - ::arrow::util::InitializeUTF8(); - const uint8_t* data = reinterpret_cast(key_id.c_str()); - if (!::arrow::util::ValidateUTF8(data, key_id.size())) { - throw ParquetException("key id should be in UTF8 encoding"); - } - - DCHECK(!key_id.empty()); - this->key_metadata(key_id); - return this; -} - -ColumnEncryptionProperties::ColumnEncryptionProperties( - bool encrypted, const std::shared_ptr& column_path, - const std::string& key, const std::string& key_metadata) - : column_path_(column_path) { - DCHECK(column_path != nullptr); - if (!encrypted) { - DCHECK(key.empty() && key_metadata.empty()); - } - - if (!key.empty()) { - DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); - } - - encrypted_with_footer_key_ = (encrypted && key.empty()); - if (encrypted_with_footer_key_) { - DCHECK(key_metadata.empty()); - } - - encrypted_ = encrypted; - key_metadata_ = key_metadata; - key_ = key; -} - -ColumnDecryptionProperties::ColumnDecryptionProperties( - const std::shared_ptr& column_path, const std::string& key) - : column_path_(column_path) { - DCHECK(column_path != nullptr); - - if (!key.empty()) { - DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32); - } - - key_ = key; -} - -const std::string& FileDecryptionProperties::column_key( - 
const std::shared_ptr& column_path) { - if (column_properties_.find(column_path) != column_properties_.end()) { - auto column_prop = column_properties_[column_path]; - if (column_prop != nullptr) { - return column_prop->key(); - } - } - return NULL_STRING; -} - -FileDecryptionProperties::FileDecryptionProperties( - const std::string& footer_key, - const std::shared_ptr& key_retriever, - bool check_plaintext_footer_integrity, const std::string& aad_prefix, - std::shared_ptr aad_prefix_verifier, - const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties, - bool plaintext_files_allowed) { - DCHECK(!footer_key.empty() || NULLPTR != key_retriever || - 0 != column_properties.size()); - - if (!footer_key.empty()) { - DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || - footer_key.length() == 32); - } - if (footer_key.empty() && check_plaintext_footer_integrity) { - DCHECK(NULLPTR != key_retriever); - } - aad_prefix_verifier_ = aad_prefix_verifier; - footer_key_ = footer_key; - check_plaintext_footer_integrity_ = check_plaintext_footer_integrity; - key_retriever_ = key_retriever; - aad_prefix_ = aad_prefix; - column_properties_ = column_properties; - plaintext_files_allowed_ = plaintext_files_allowed; -} - -FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key_id( - const std::string& key_id) { - // key_id is expected to be in UTF8 encoding - ::arrow::util::InitializeUTF8(); - const uint8_t* data = reinterpret_cast(key_id.c_str()); - if (!::arrow::util::ValidateUTF8(data, key_id.size())) { - throw ParquetException("footer key id should be in UTF8 encoding"); - } - - if (key_id.empty()) { - return this; - } - - return footer_key_metadata(key_id); -} - -std::shared_ptr FileEncryptionProperties::column_properties( - const std::shared_ptr& column_path) { - if (column_properties_.size() == 0) { - auto builder = std::shared_ptr( - new ColumnEncryptionProperties::Builder(column_path)); - return 
builder->build(); - } - if (column_properties_.find(column_path) != column_properties_.end()) { - return column_properties_[column_path]; - } - - return NULLPTR; -} - -FileEncryptionProperties::FileEncryptionProperties( - ParquetCipher::type cipher, const std::string& footer_key, - const std::string& footer_key_metadata, bool encrypted_footer, - const std::string& aad_prefix, bool store_aad_prefix_in_file, - const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties) - : footer_key_(footer_key), - footer_key_metadata_(footer_key_metadata), - encrypted_footer_(encrypted_footer), - column_properties_(column_properties) { - DCHECK(!footer_key.empty()); - // footer_key must be either 16, 24 or 32 bytes. - DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || - footer_key.length() == 32); - - uint8_t aad_file_unique[AAD_FILE_UNIQUE_LENGTH]; - memset(aad_file_unique, 0, AAD_FILE_UNIQUE_LENGTH); - RAND_bytes(aad_file_unique, sizeof(AAD_FILE_UNIQUE_LENGTH)); - std::string aad_file_unique_str(reinterpret_cast(aad_file_unique), - AAD_FILE_UNIQUE_LENGTH); - - bool supply_aad_prefix = false; - if (aad_prefix.empty()) { - file_aad_ = aad_file_unique_str; - } else { - file_aad_ = aad_prefix + aad_file_unique_str; - if (!store_aad_prefix_in_file) supply_aad_prefix = true; - } - algorithm_.algorithm = cipher; - algorithm_.aad.aad_file_unique = aad_file_unique_str; - algorithm_.aad.supply_aad_prefix = supply_aad_prefix; - if (!aad_prefix.empty() && store_aad_prefix_in_file) { - algorithm_.aad.aad_prefix = aad_prefix; - } -} - -} // namespace parquet diff --git a/cpp/src/parquet/encryption_properties.h b/cpp/src/parquet/encryption_properties.h deleted file mode 100644 index b40acb341673..000000000000 --- a/cpp/src/parquet/encryption_properties.h +++ /dev/null @@ -1,481 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef PARQUET_ENCRYPTION_PROPERTIES_H -#define PARQUET_ENCRYPTION_PROPERTIES_H - -#include -#include -#include -#include - -#include "arrow/util/logging.h" -#include "parquet/encryption.h" -#include "parquet/exception.h" -#include "parquet/schema.h" -#include "parquet/types.h" -#include "parquet/util/visibility.h" - -namespace parquet { - -static const std::string NULL_STRING = ""; -static constexpr ParquetCipher::type DEFAULT_ENCRYPTION_ALGORITHM = - ParquetCipher::AES_GCM_V1; -static constexpr int32_t MAXIMAL_AAD_METADATA_LENGTH = 256; -static constexpr bool DEFAULT_ENCRYPTED_FOOTER = true; -static constexpr bool DEFAULT_CHECK_SIGNATURE = true; -static constexpr bool DEFAULT_ALLOW_PLAINTEXT_FILES = false; -static constexpr int32_t AAD_FILE_UNIQUE_LENGTH = 8; - -class PARQUET_EXPORT ColumnEncryptionProperties { - public: - class Builder { - public: - // Convenience builder for regular (not nested) columns. - explicit Builder(const std::string& name) { - Builder(schema::ColumnPath::FromDotString(name), true); - } - - // Convenience builder for encrypted columns. - explicit Builder(const std::shared_ptr& path) - : Builder(path, true) {} - - // Set a column-specific key. - // If key is not set on an encrypted column, the column will - // be encrypted with the footer key. 
- // keyBytes Key length must be either 16, 24 or 32 bytes. - Builder* key(const std::string& key) { - if (key.empty()) return this; - - DCHECK(!key.empty()); - key_ = key; - return this; - } - - // Set a key retrieval metadata. - // use either key_metadata() or key_id(), not both - Builder* key_metadata(const std::string& key_metadata) { - DCHECK(!key_metadata.empty()); - DCHECK(key_metadata_.empty()); - key_metadata_ = key_metadata; - return this; - } - - // Set a key retrieval metadata (converted from String). - // use either key_metadata() or key_id(), not both - // key_id will be converted to metadata (UTF-8 array). - Builder* key_id(const std::string& key_id); - - std::shared_ptr build() { - return std::shared_ptr( - new ColumnEncryptionProperties(encrypted_, column_path_, key_, key_metadata_)); - } - - private: - const std::shared_ptr column_path_; - bool encrypted_; - std::string key_; - std::string key_metadata_; - - Builder(const std::shared_ptr& path, bool encrypted) - : column_path_(path), encrypted_(encrypted) {} - }; - - const std::shared_ptr& column_path() { return column_path_; } - bool is_encrypted() const { return encrypted_; } - bool is_encrypted_with_footer_key() const { return encrypted_with_footer_key_; } - const std::string& key() const { return key_; } - const std::string& key_metadata() const { return key_metadata_; } - - ColumnEncryptionProperties() = default; - ColumnEncryptionProperties(const ColumnEncryptionProperties& other) = default; - ColumnEncryptionProperties(ColumnEncryptionProperties&& other) = default; - - private: - const std::shared_ptr column_path_; - bool encrypted_; - bool encrypted_with_footer_key_; - std::string key_; - std::string key_metadata_; - explicit ColumnEncryptionProperties( - bool encrypted, const std::shared_ptr& column_path, - const std::string& key, const std::string& key_metadata); -}; - -class PARQUET_EXPORT ColumnDecryptionProperties { - public: - class Builder { - public: - // convenience builder for 
regular (not nested) columns. - explicit Builder(const std::string& name) - : Builder(schema::ColumnPath::FromDotString(name)) {} - - explicit Builder(const std::shared_ptr& path) - : column_path_(path) {} - - // Set an explicit column key. If applied on a file that contains - // key metadata for this column the metadata will be ignored, - // the column will be decrypted with this key. - // key length must be either 16, 24 or 32 bytes. - Builder* key(const std::string& key) { - if (key.empty()) return this; - - DCHECK(!key.empty()); - key_ = key; - return this; - } - - std::shared_ptr build() { - return std::shared_ptr( - new ColumnDecryptionProperties(column_path_, key_)); - } - - private: - const std::shared_ptr column_path_; - std::string key_; - }; - - ColumnDecryptionProperties() = default; - ColumnDecryptionProperties(const ColumnDecryptionProperties& other) = default; - ColumnDecryptionProperties(ColumnDecryptionProperties&& other) = default; - - const std::shared_ptr& column_path() { return column_path_; } - const std::string& key() const { return key_; } - - private: - const std::shared_ptr column_path_; - std::string key_; - - // This class is only required for setting explicit column decryption keys - - // to override key retriever (or to provide keys when key metadata and/or - // key retriever are not available) - explicit ColumnDecryptionProperties( - const std::shared_ptr& column_path, const std::string& key); -}; - -class PARQUET_EXPORT AADPrefixVerifier { - public: - // Verifies identity (AAD Prefix) of individual file, - // or of file collection in a data set. - // Throws exception if an AAD prefix is wrong. - // In a data set, AAD Prefixes should be collected, - // and then checked for missing files. 
- virtual void check(const std::string& aad_prefix) = 0; -}; - -class PARQUET_EXPORT FileDecryptionProperties { - public: - class Builder { - public: - Builder() { - check_plaintext_footer_integrity_ = DEFAULT_CHECK_SIGNATURE; - plaintext_files_allowed_ = DEFAULT_ALLOW_PLAINTEXT_FILES; - } - - // Set an explicit footer key. If applied on a file that contains - // footer key metadata the metadata will be ignored, the footer - // will be decrypted/verified with this key. - // If explicit key is not set, footer key will be fetched from - // key retriever. - // param footerKey Key length must be either 16, 24 or 32 bytes. - Builder* footer_key(const std::string& footer_key) { - if (footer_key.empty()) { - return this; - } - DCHECK(!footer_key.empty()); - footer_key_ = footer_key; - return this; - } - - // Set explicit column keys (decryption properties). - // Its also possible to set a key retriever on this property object. - // Upon file decryption, availability of explicit keys is checked before - // invocation of the retriever callback. - // If an explicit key is available for a footer or a column, - // its key metadata will be ignored. - Builder* column_properties( - const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties) { - if (column_properties.size() == 0) return this; - - if (column_properties_.size() != 0) - throw ParquetException("Column properties already set"); - - column_properties_ = column_properties; - return this; - } - - // Set a key retriever callback. Its also possible to - // set explicit footer or column keys on this file property object. - // Upon file decryption, availability of explicit keys is checked before - // invocation of the retriever callback. - // If an explicit key is available for a footer or a column, - // its key metadata will be ignored. 
- Builder* key_retriever(const std::shared_ptr& key_retriever) { - if (key_retriever == NULLPTR) return this; - - DCHECK(key_retriever_ == NULLPTR); - key_retriever_ = key_retriever; - return this; - } - - // Skip integrity verification of plaintext footers. - // If not called, integrity of plaintext footers will be checked in runtime, - // and an exception will be thrown in the following situations: - // - footer signing key is not available - // (not passed, or not found by key retriever) - // - footer content and signature don't match - Builder* disable_footer_signature_verification() { - check_plaintext_footer_integrity_ = false; - return this; - } - - // Explicitly supply the file AAD prefix. - // A must when a prefix is used for file encryption, but not stored in file. - // If AAD prefix is stored in file, it will be compared to the explicitly - // supplied value and an exception will be thrown if they differ. - Builder* aad_prefix(const std::string& aad_prefix) { - if (aad_prefix.empty()) { - return this; - } - DCHECK(aad_prefix_.empty()); - aad_prefix_ = aad_prefix; - return this; - } - - // Set callback for verification of AAD Prefixes stored in file. - Builder* aad_prefix_verifier(std::shared_ptr aad_prefix_verifier) { - if (aad_prefix_verifier == NULLPTR) return this; - - DCHECK(aad_prefix_verifier_ == NULLPTR); - aad_prefix_verifier_ = aad_prefix_verifier; - return this; - } - - // By default, reading plaintext (unencrypted) files is not - // allowed when using a decryptor - // - in order to detect files that were not encrypted by mistake. - // However, the default behavior can be overriden by calling this method. - // The caller should use then a different method to ensure encryption - // of files with sensitive data. 
- Builder* plaintext_files_allowed() { - plaintext_files_allowed_ = true; - return this; - } - - std::shared_ptr build() { - return std::shared_ptr(new FileDecryptionProperties( - footer_key_, key_retriever_, check_plaintext_footer_integrity_, aad_prefix_, - aad_prefix_verifier_, column_properties_, plaintext_files_allowed_)); - } - - private: - std::string footer_key_; - std::string aad_prefix_; - std::shared_ptr aad_prefix_verifier_; - - std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath> - column_properties_; - - std::shared_ptr key_retriever_; - bool check_plaintext_footer_integrity_; - bool plaintext_files_allowed_; - }; - - const std::string& column_key(const std::shared_ptr& column_path); - - const std::string& footer_key() { return footer_key_; } - - const std::string& aad_prefix() { return aad_prefix_; } - std::shared_ptr key_retriever() { return key_retriever_; } - - bool check_plaintext_footer_integrity() { return check_plaintext_footer_integrity_; } - - bool plaintext_files_allowed() { return plaintext_files_allowed_; } - - const std::shared_ptr& aad_prefix_verifier() { - return aad_prefix_verifier_; - } - - private: - std::string footer_key_; - std::string aad_prefix_; - std::shared_ptr aad_prefix_verifier_; - - std::map, - std::shared_ptr, schema::ColumnPath::CmpColumnPath> - column_properties_; - - std::shared_ptr key_retriever_; - bool check_plaintext_footer_integrity_; - bool plaintext_files_allowed_; - - FileDecryptionProperties( - const std::string& footer_key, - const std::shared_ptr& key_retriever, - bool check_plaintext_footer_integrity, const std::string& aad_prefix, - std::shared_ptr aad_prefix_verifier, - const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties, - bool plaintext_files_allowed); -}; - -class PARQUET_EXPORT FileEncryptionProperties { - public: - class Builder { - public: - explicit Builder(const std::string& footer_key) - : parquet_cipher_(DEFAULT_ENCRYPTION_ALGORITHM), - 
encrypted_footer_(DEFAULT_ENCRYPTED_FOOTER) { - footer_key_ = footer_key; - store_aad_prefix_in_file_ = false; - } - - // Create files with plaintext footer. - // If not called, the files will be created with encrypted footer (default). - Builder* enable_plaintext_footer() { - encrypted_footer_ = false; - return this; - } - - // Set encryption algorithm. - // If not called, files will be encrypted with AES_GCM_V1 (default). - Builder* algorithm(ParquetCipher::type parquet_cipher) { - parquet_cipher_ = parquet_cipher; - return this; - } - - // Set a key retrieval metadata (converted from String). - // use either footer_key_metadata or footer_key_id, not both. - Builder* footer_key_id(const std::string& key_id); - - // Set a key retrieval metadata. - // use either footer_key_metadata or footer_key_id, not both. - Builder* footer_key_metadata(const std::string& footer_key_metadata) { - if (footer_key_metadata.empty()) return this; - - DCHECK(footer_key_metadata_.empty()); - footer_key_metadata_ = footer_key_metadata; - return this; - } - - // Set the file AAD Prefix. - Builder* aad_prefix(const std::string& aad_prefix) { - if (aad_prefix.empty()) return this; - - DCHECK(aad_prefix_.empty()); - aad_prefix_ = aad_prefix; - store_aad_prefix_in_file_ = true; - return this; - } - - // Skip storing AAD Prefix in file. - // If not called, and if AAD Prefix is set, it will be stored. - Builder* disable_store_aad_prefix_storage() { - DCHECK(!aad_prefix_.empty()); - - store_aad_prefix_in_file_ = false; - return this; - } - - // Set the list of encrypted columns and their properties (keys etc). - // If not called, all columns will be encrypted with the footer key. - // If called, the file columns not in the list will be left unencrypted. 
- Builder* column_properties( - const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties) { - if (column_properties.size() == 0) return this; - - if (column_properties_.size() != 0) - throw ParquetException("Column properties already set"); - - column_properties_ = column_properties; - return this; - } - - std::shared_ptr build() { - return std::shared_ptr(new FileEncryptionProperties( - parquet_cipher_, footer_key_, footer_key_metadata_, encrypted_footer_, - aad_prefix_, store_aad_prefix_in_file_, column_properties_)); - } - - private: - ParquetCipher::type parquet_cipher_; - bool encrypted_footer_; - std::string footer_key_; - std::string footer_key_metadata_; - - std::string aad_prefix_; - bool store_aad_prefix_in_file_; - std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath> - column_properties_; - }; - bool encrypted_footer() const { return encrypted_footer_; } - - const EncryptionAlgorithm algorithm() { return algorithm_; } - - const std::string& footer_encryption_key() { - return (encrypted_footer_ ? footer_key_ : NULL_STRING); - } - - const std::string& footer_encryption_key_metadata() { - return (encrypted_footer_ ? footer_key_metadata_ : NULL_STRING); - } - - const std::string& footer_signing_key() { - return (encrypted_footer_ ? NULL_STRING : footer_key_); - } - - const std::string& footer_signing_key_metadata() { - return (encrypted_footer_ ? 
NULL_STRING : footer_key_metadata_); - } - - const std::string& file_aad() const { return file_aad_; } - - std::shared_ptr column_properties( - const std::shared_ptr& column_path); - - private: - EncryptionAlgorithm algorithm_; - std::string footer_key_; - std::string footer_key_metadata_; - bool encrypted_footer_; - std::string file_aad_; - - std::map, - std::shared_ptr, schema::ColumnPath::CmpColumnPath> - column_properties_; - - FileEncryptionProperties( - ParquetCipher::type cipher, const std::string& footer_key, - const std::string& footer_key_metadata, bool encrypted_footer, - const std::string& aad_prefix, bool store_aad_prefix_in_file, - const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties); -}; - -} // namespace parquet - -#endif // PARQUET_ENCRYPTION_PROPERTIES_H diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index 7514d37d9245..a386f643175a 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -16,7 +16,7 @@ // under the License. #include "parquet/internal_file_decryptor.h" -#include "parquet/encryption_properties.h" +#include "parquet/encryption.h" #include "parquet/util/crypto.h" namespace parquet { diff --git a/cpp/src/parquet/internal_file_encryptor.cc b/cpp/src/parquet/internal_file_encryptor.cc index c2127f06dd12..f786fc9c797f 100644 --- a/cpp/src/parquet/internal_file_encryptor.cc +++ b/cpp/src/parquet/internal_file_encryptor.cc @@ -16,7 +16,7 @@ // under the License. 
#include "parquet/internal_file_encryptor.h" -#include "parquet/encryption_properties.h" +#include "parquet/encryption.h" #include "parquet/util/crypto.h" namespace parquet { diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 712fde1ddb47..56aa7ed37e2a 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -26,7 +26,7 @@ #include "arrow/type.h" -#include "parquet/encryption_properties.h" +#include "parquet/encryption.h" #include "parquet/exception.h" #include "parquet/parquet_version.h" #include "parquet/platform.h" From 5c661ddd731f33e59a5b060f8ea8b9faac0fc8e8 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Sun, 12 May 2019 21:45:14 +0300 Subject: [PATCH 055/201] Remove encryption_properties.cc from CMakeLists.txt --- cpp/src/parquet/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 820934230669..9e1638957e84 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -168,7 +168,6 @@ set(PARQUET_SRCS deprecated_io.cc encoding.cc encryption.cc - encryption_properties.cc internal_file_decryptor.cc internal_file_encryptor.cc file_reader.cc From b988c60378fe8df66dcb65d5e119320e847ae2c8 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Sun, 12 May 2019 21:49:22 +0300 Subject: [PATCH 056/201] Add column_metadata_map_, column_data_map_, footer_signing_encryptor_ and footer_encryptor_ to InternalFileEncryptor --- cpp/src/parquet/internal_file_encryptor.cc | 56 ++++++++++++++++++---- cpp/src/parquet/internal_file_encryptor.h | 13 +++++ 2 files changed, 60 insertions(+), 9 deletions(-) diff --git a/cpp/src/parquet/internal_file_encryptor.cc b/cpp/src/parquet/internal_file_encryptor.cc index f786fc9c797f..cd0f648e7c2c 100644 --- a/cpp/src/parquet/internal_file_encryptor.cc +++ b/cpp/src/parquet/internal_file_encryptor.cc @@ -44,26 +44,49 @@ int Encryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* 
cip // InternalFileEncryptor InternalFileEncryptor::InternalFileEncryptor(FileEncryptionProperties* properties) - : properties_(properties) {} + : properties_(properties) { + column_data_map_ = std::shared_ptr, + std::shared_ptr, parquet::schema::ColumnPath::CmpColumnPath>>( + new std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>()); + + column_metadata_map_ = std::shared_ptr, + std::shared_ptr, parquet::schema::ColumnPath::CmpColumnPath>>( + new std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>()); +} std::shared_ptr InternalFileEncryptor::GetFooterEncryptor() { + if (footer_encryptor_ != NULLPTR) { + return footer_encryptor_; + } ParquetCipher::type algorithm = properties_->algorithm().algorithm; std::string aad = parquet_encryption::createFooterAAD(properties_->file_aad()); std::string footer_key = properties_->footer_encryption_key(); auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_key.size()); + std::shared_ptr encryptor = std::make_shared( + aes_encryptor, footer_key, properties_->file_aad(), + aad); + footer_encryptor_ = encryptor; + return encryptor; - return std::make_shared(aes_encryptor, footer_key, properties_->file_aad(), - aad); } std::shared_ptr InternalFileEncryptor::GetFooterSigningEncryptor() { + if (footer_signing_encryptor_ != NULLPTR) { + return footer_signing_encryptor_; + } ParquetCipher::type algorithm = properties_->algorithm().algorithm; std::string aad = parquet_encryption::createFooterAAD(properties_->file_aad()); std::string footer_signing_key = properties_->footer_signing_key(); auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_signing_key.size()); - - return std::make_shared(aes_encryptor, footer_signing_key, - properties_->file_aad(), aad); + std::shared_ptr encryptor = std::make_shared( + aes_encryptor, footer_signing_key, + properties_->file_aad(), aad); + footer_signing_encryptor_ = encryptor; + return encryptor; } std::shared_ptr InternalFileEncryptor::GetColumnMetaEncryptor( @@ 
-79,6 +102,16 @@ std::shared_ptr InternalFileEncryptor::GetColumnDataEncryptor( std::shared_ptr InternalFileEncryptor::InternalFileEncryptor::GetColumnEncryptor( const std::shared_ptr& column_path, bool metadata) { + // first look if we already got the encryptor from before + if (metadata) { + if (column_metadata_map_->find(column_path) != column_metadata_map_->end()) { + return column_metadata_map_->at(column_path); + } + } else { + if (column_data_map_->find(column_path) != column_data_map_->end()) { + return column_data_map_->at(column_path); + } + } auto column_prop = properties_->column_properties(column_path); if (column_prop == NULLPTR) { return NULLPTR; @@ -100,9 +133,14 @@ InternalFileEncryptor::InternalFileEncryptor::GetColumnEncryptor( : GetDataAesEncryptor(algorithm, key.size()); std::string file_aad = properties_->file_aad(); - - // TODO: aad - return std::make_shared(aes_encryptor, key, file_aad, ""); + std::shared_ptr encryptor = std::make_shared( + aes_encryptor, key, file_aad, ""); + if (metadata) + (*column_metadata_map_)[column_path] = encryptor; + else + (*column_data_map_)[column_path] = encryptor; + + return encryptor; } parquet_encryption::AesEncryptor* InternalFileEncryptor::GetMetaAesEncryptor( diff --git a/cpp/src/parquet/internal_file_encryptor.h b/cpp/src/parquet/internal_file_encryptor.h index ccef1315f603..fcc85c473ca3 100644 --- a/cpp/src/parquet/internal_file_encryptor.h +++ b/cpp/src/parquet/internal_file_encryptor.h @@ -20,6 +20,7 @@ #include #include +#include #include "parquet/schema.h" @@ -62,6 +63,18 @@ class InternalFileEncryptor { private: FileEncryptionProperties* properties_; + std::shared_ptr, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath>> + column_data_map_; + std::shared_ptr, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath>> + column_metadata_map_; + + std::shared_ptr footer_signing_encryptor_; + std::shared_ptr footer_encryptor_; + std::unique_ptr meta_encryptor_128_; std::unique_ptr 
meta_encryptor_196_; std::unique_ptr meta_encryptor_256_; From 5382b02e18ac0b45ef24ac37de82fa763c719e77 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Sun, 12 May 2019 22:30:51 +0300 Subject: [PATCH 057/201] Add column_data_map_, column_metadata_map_, footer_data_decryptor_ and footer_metadata_decryptor_ to InternalFileDecryptor --- cpp/src/parquet/internal_file_decryptor.cc | 74 ++++++++++++++++------ cpp/src/parquet/internal_file_decryptor.h | 21 ++++-- 2 files changed, 68 insertions(+), 27 deletions(-) diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index a386f643175a..81c551154afc 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -67,11 +67,25 @@ int Decryptor::Decrypt(const uint8_t* ciphertext, int ciphertext_len, // InternalFileDecryptor InternalFileDecryptor::InternalFileDecryptor(FileDecryptionProperties* properties, - const std::string& file_aad, - ParquetCipher::type algorithm, - const std::string& footer_key_metadata) + const std::string& file_aad, + ParquetCipher::type algorithm, + const std::string& footer_key_metadata) : properties_(properties), file_aad_(file_aad), - algorithm_(algorithm), footer_key_metadata_(footer_key_metadata) {} + algorithm_(algorithm), footer_key_metadata_(footer_key_metadata) { + column_data_map_ = std::shared_ptr, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath>>( + new std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>()); + + column_metadata_map_ = std::shared_ptr, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath>>( + new std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>()); + } std::shared_ptr InternalFileDecryptor::GetFooterSigningEncryptor() { @@ -100,7 +114,7 @@ InternalFileDecryptor::GetFooterSigningEncryptor() { std::string aad = parquet_encryption::createFooterAAD(file_aad_); footer_signing_encryptor_ = - std::make_shared(algorithm_, footer_key, 
file_aad_, aad); + std::make_shared(algorithm_, footer_key, file_aad_, aad); return footer_signing_encryptor_; } @@ -121,7 +135,12 @@ std::shared_ptr InternalFileDecryptor::GetFooterDecryptorForColumnDat std::shared_ptr InternalFileDecryptor::GetFooterDecryptor( const std::string& aad, bool metadata) { - if (footer_decryptor_ != NULLPTR) return footer_decryptor_; + if (metadata) { + if (footer_metadata_decryptor_ != NULLPTR) return footer_metadata_decryptor_; + } else { + if (footer_data_decryptor_ != NULLPTR) return footer_data_decryptor_; + } + std::string footer_key = properties_->footer_key(); if (footer_key.empty()) { if (footer_key_metadata_.empty()) @@ -144,9 +163,15 @@ std::shared_ptr InternalFileDecryptor::GetFooterDecryptor( auto aes_decryptor = metadata ? GetMetaAesDecryptor(footer_key.size()) : GetDataAesDecryptor(footer_key.size()); - footer_decryptor_ = - std::make_shared(aes_decryptor, footer_key, file_aad_, aad); - return footer_decryptor_; + std::shared_ptr decryptor = + std::make_shared(aes_decryptor, footer_key, file_aad_, aad); + + if (metadata) + footer_metadata_decryptor_ = decryptor; + else + footer_data_decryptor_ = decryptor; + + return decryptor; } std::shared_ptr InternalFileDecryptor::GetColumnMetaDecryptor( @@ -165,10 +190,17 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( std::shared_ptr column_path, const std::string& column_key_metadata, const std::string& aad, bool metadata) { std::string column_key; - // first look if we already got the key from before - if (column_map_ != NULLPTR && column_map_->find(column_path) != column_map_->end()) { - column_key = column_map_->at(column_path); + // first look if we already got the decryptor from before + if (metadata) { + if (column_metadata_map_->find(column_path) != column_metadata_map_->end()) { + return column_metadata_map_->at(column_path); + } } else { + if (column_data_map_->find(column_path) != column_data_map_->end()) { + return column_data_map_->at(column_path); + } + 
} + column_key = properties_->column_key(column_path); // No explicit column key given via API. Retrieve via key metadata. if (column_key.empty() && !column_key_metadata.empty() && @@ -182,21 +214,23 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( throw HiddenColumnException(ss.str()); } } - } if (column_key.empty()) { throw HiddenColumnException("HiddenColumnException, path=" + column_path->ToDotString()); } - if (column_map_ != NULLPTR) { - // save column key for future use - (*column_map_)[column_path] = column_key; - } - auto aes_decryptor = metadata ? GetMetaAesDecryptor(column_key.size()) - : GetDataAesDecryptor(column_key.size()); + : GetDataAesDecryptor(column_key.size()); + + std::shared_ptr decryptor = std::make_shared( + aes_decryptor, column_key, file_aad_, aad); + if (metadata) + (*column_metadata_map_)[column_path] = decryptor; + else + (*column_data_map_)[column_path] = decryptor; + + return decryptor; - return std::make_shared(aes_decryptor, column_key, file_aad_, aad); } parquet_encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor( diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h index 320c3546ed4e..0b0c41789db8 100644 --- a/cpp/src/parquet/internal_file_decryptor.h +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -71,9 +71,9 @@ class Decryptor { class InternalFileDecryptor { public: explicit InternalFileDecryptor(FileDecryptionProperties* properties, - const std::string& file_aad, - ParquetCipher::type algorithm, - const std::string& footer_key_metadata); + const std::string& file_aad, + ParquetCipher::type algorithm, + const std::string& footer_key_metadata); std::string& file_aad() { return file_aad_; } @@ -99,10 +99,17 @@ class InternalFileDecryptor { FileDecryptionProperties* properties_; // Concatenation of aad_prefix (if exists) and aad_file_unique std::string file_aad_; - // A map between ColumnPath and their encryption keys - std::shared_ptr, std::string, - 
parquet::schema::ColumnPath::CmpColumnPath>> - column_map_; + std::shared_ptr, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath>> + column_data_map_; + std::shared_ptr, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath>> + column_metadata_map_; + + std::shared_ptr footer_metadata_decryptor_; + std::shared_ptr footer_data_decryptor_; ParquetCipher::type algorithm_; std::string footer_key_metadata_; std::shared_ptr footer_decryptor_; From 11e68f1315803391206430f77eb6bfcaceb019fb Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Sun, 12 May 2019 22:38:52 +0300 Subject: [PATCH 058/201] Rename aad to update_aad in Encryptor and Decryptor classes --- cpp/src/parquet/column_reader.cc | 8 ++++---- cpp/src/parquet/column_writer.cc | 10 +++++----- cpp/src/parquet/internal_file_decryptor.h | 2 +- cpp/src/parquet/internal_file_encryptor.h | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 60912d55981c..16ab2067b8d2 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -230,10 +230,10 @@ std::shared_ptr SerializedPageReader::NextPage() { aad = parquet_encryption::createModuleAAD( meta_decryptor_->file_aad(), parquet_encryption::DictionaryPageHeader, row_group_ordinal_, column_ordinal_, (int16_t)-1); - meta_decryptor_->aad(aad); + meta_decryptor_->update_aad(aad); } else { parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, page_ordinal_); - meta_decryptor_->aad(data_page_headerAAD_); + meta_decryptor_->update_aad(data_page_headerAAD_); } } DeserializeThriftMsg(reinterpret_cast(buffer.data()), @@ -262,10 +262,10 @@ std::shared_ptr SerializedPageReader::NextPage() { aad = parquet_encryption::createModuleAAD( data_decryptor_->file_aad(), parquet_encryption::DictionaryPage, row_group_ordinal_, column_ordinal_, (int16_t)-1); - data_decryptor_->aad(aad); + data_decryptor_->update_aad(aad); } else { 
parquet_encryption::quickUpdatePageAAD(data_pageAAD_, page_ordinal_); - data_decryptor_->aad(data_pageAAD_); + data_decryptor_->update_aad(data_pageAAD_); } } diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index ee8a95537f81..cf357450bc28 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -194,7 +194,7 @@ class SerializedPageWriter : public PageWriter { std::shared_ptr encrypted_data_buffer = nullptr; if (data_encryptor_.get()) { - data_encryptor_->aad( + data_encryptor_->update_aad( parquet_encryption::createModuleAAD(data_encryptor_->file_aad(), parquet_encryption::DictionaryPage, row_group_ordinal_, @@ -220,7 +220,7 @@ class SerializedPageWriter : public PageWriter { } if (meta_encryptor_) { - meta_encryptor_->aad(parquet_encryption::createModuleAAD( + meta_encryptor_->update_aad(parquet_encryption::createModuleAAD( meta_encryptor_->file_aad(), parquet_encryption::DictionaryPageHeader, row_group_ordinal_, column_ordinal_, (int16_t)-1)); } @@ -242,7 +242,7 @@ class SerializedPageWriter : public PageWriter { total_compressed_size_, total_uncompressed_size_, has_dictionary, fallback); if (meta_encryptor_ != nullptr) { - meta_encryptor_->aad(parquet_encryption::createModuleAAD( + meta_encryptor_->update_aad(parquet_encryption::createModuleAAD( meta_encryptor_->file_aad(), parquet_encryption::ColumnMetaData, row_group_ordinal_, column_ordinal_, (int16_t)-1)); } @@ -289,7 +289,7 @@ class SerializedPageWriter : public PageWriter { std::shared_ptr encrypted_data_buffer = AllocateBuffer(pool_, 0); if (data_encryptor_.get()) { parquet_encryption::quickUpdatePageAAD(data_pageAAD_, page_ordinal_); - data_encryptor_->aad(data_pageAAD_); + data_encryptor_->update_aad(data_pageAAD_); encrypted_data_buffer->Resize(data_encryptor_->CiphertextSizeDelta() + output_data_len); output_data_len = data_encryptor_->Encrypt(compressed_data->data(), output_data_len, @@ -312,7 +312,7 @@ class SerializedPageWriter : 
public PageWriter { if (meta_encryptor_) { parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, page_ordinal_); - meta_encryptor_->aad(data_page_headerAAD_); + meta_encryptor_->update_aad(data_page_headerAAD_); } int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_); diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h index 0b0c41789db8..733c448f4f7f 100644 --- a/cpp/src/parquet/internal_file_decryptor.h +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -56,7 +56,7 @@ class Decryptor { const std::string& file_aad, const std::string& aad); const std::string& file_aad() const { return file_aad_; } - void aad(const std::string& aad) { aad_ = aad; } + void update_aad(const std::string& aad) { aad_ = aad; } int CiphertextSizeDelta(); int Decrypt(const uint8_t* ciphertext, int ciphertext_len, uint8_t* plaintext); diff --git a/cpp/src/parquet/internal_file_encryptor.h b/cpp/src/parquet/internal_file_encryptor.h index fcc85c473ca3..bfa7cca3e0dd 100644 --- a/cpp/src/parquet/internal_file_encryptor.h +++ b/cpp/src/parquet/internal_file_encryptor.h @@ -37,7 +37,7 @@ class Encryptor { Encryptor(parquet_encryption::AesEncryptor* aes_encryptor, const std::string& key, const std::string& file_aad, const std::string& aad); const std::string& file_aad() { return file_aad_; } - void aad(const std::string& aad) { aad_ = aad; } + void update_aad(const std::string& aad) { aad_ = aad; } int CiphertextSizeDelta(); int Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* ciphertext); From 9273de0747074073b0458b46f76d4bad1703a8d3 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Sun, 12 May 2019 22:48:10 +0300 Subject: [PATCH 059/201] Move PARQUET_EMAGIC and PARQUET_MAGIC to file_writer.h and use it in file_reader.cc --- cpp/src/parquet/file_reader.cc | 3 +-- cpp/src/parquet/file_writer.cc | 16 ++++++---------- cpp/src/parquet/file_writer.h | 4 ++++ 3 files changed, 11 insertions(+), 
12 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index b92b1c91d57e..1894fbbfc94f 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -29,6 +29,7 @@ #include "arrow/util/logging.h" #include "arrow/util/ubsan.h" +#include "parquet/file_writer.h" #include "parquet/column_reader.h" #include "parquet/column_scanner.h" #include "parquet/deprecated_io.h" @@ -47,8 +48,6 @@ namespace parquet { // PARQUET-978: Minimize footer reads by reading 64 KB from the end of the file static constexpr int64_t kDefaultFooterReadSize = 64 * 1024; static constexpr uint32_t kFooterSize = 8; -static constexpr uint8_t kParquetMagic[4] = {'P', 'A', 'R', '1'}; -static constexpr uint8_t kParquetEMagic[4] = {'P', 'A', 'R', 'E'}; // For PARQUET-816 static constexpr int64_t kMaxDictHeaderSize = 100; diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 5653a146488b..c917e230e20b 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -37,10 +37,6 @@ using parquet::schema::GroupNode; namespace parquet { -// FIXME: copied from reader-internal.cc -static constexpr uint8_t PARQUET_MAGIC[4] = {'P', 'A', 'R', '1'}; -static constexpr uint8_t PARQUET_EMAGIC[4] = {'P', 'A', 'R', 'E'}; - // ---------------------------------------------------------------------- // RowGroupWriter public API @@ -308,7 +304,7 @@ class FileSerializer : public ParquetFileWriter::Contents { uint32_t footer_and_crypto_len = static_cast(sink_->Tell() - metadata_start); sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4); - sink_->Write(PARQUET_EMAGIC, 4); + sink_->Write(kParquetEMagic, 4); } else { // footer plain mode EncryptionAlgorithm signing_encryption; @@ -395,15 +391,15 @@ class FileSerializer : public ParquetFileWriter::Contents { auto file_encryption = properties_->file_encryption(); if (file_encryption == nullptr) { // Unencrypted parquet files always start with PAR1 - 
PARQUET_THROW_NOT_OK(sink_->Write(PARQUET_MAGIC, 4)); + PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4)); } else { file_encryptor_.reset(new InternalFileEncryptor(file_encryption)); if (file_encryption->encrypted_footer()) { - PARQUET_THROW_NOT_OK(sink_->Write(PARQUET_EMAGIC, 4)); + PARQUET_THROW_NOT_OK(sink_->Write(kParquetEMagic, 4)); } else { // plaintext mode footer - PARQUET_THROW_NOT_OK(sink_->Write(PARQUET_MAGIC, 4)); + PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4)); } } } @@ -455,7 +451,7 @@ void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sin // Write Footer PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast(&metadata_len), 4)); - PARQUET_THROW_NOT_OK(sink->Write(PARQUET_MAGIC, 4)); + PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4)); } else { if (encrypt_footer) { // encrypt and write to sink @@ -466,7 +462,7 @@ void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sin metadata_len = static_cast(sink->Tell()) - metadata_len; sink->Write(reinterpret_cast(&metadata_len), 4); - sink->Write(PARQUET_MAGIC, 4); + sink->Write(kParquetMagic, 4); } } } diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h index 0aad4fc5abc2..15b45d38377b 100644 --- a/cpp/src/parquet/file_writer.h +++ b/cpp/src/parquet/file_writer.h @@ -31,6 +31,10 @@ namespace parquet { class ColumnWriter; class OutputStream; +// FIXME: copied from reader-internal.cc +static constexpr uint8_t kParquetMagic[4] = {'P', 'A', 'R', '1'}; +static constexpr uint8_t kParquetEMagic[4] = {'P', 'A', 'R', 'E'}; + class PARQUET_EXPORT RowGroupWriter { public: // Forward declare a virtual class 'Contents' to aid dependency injection and more From 37461cae2630574c6c13508e9e7a52d9d4464d81 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Sun, 12 May 2019 22:57:29 +0300 Subject: [PATCH 060/201] Rename file_encryption to file_encryption_properties in file_writer.cc --- cpp/src/parquet/file_writer.cc | 18 +++++++++--------- 1 file 
changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index c917e230e20b..c8b1b5e011be 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -286,12 +286,12 @@ class FileSerializer : public ParquetFileWriter::Contents { row_group_writer_.reset(); // Write magic bytes and metadata - auto file_encryption = properties_->file_encryption(); - if (file_encryption == nullptr) { + auto file_encryption_properties = properties_->file_encryption(); + if (file_encryption_properties == nullptr) { file_metadata_ = metadata_->Finish(); WriteFileMetaData(*file_metadata_, sink_.get()); } else { - if (file_encryption->encrypted_footer()) { + if (file_encryption_properties->encrypted_footer()) { // encrypted footer file_metadata_ = metadata_->Finish(); @@ -308,14 +308,14 @@ class FileSerializer : public ParquetFileWriter::Contents { } else { // footer plain mode EncryptionAlgorithm signing_encryption; - EncryptionAlgorithm algo = file_encryption->algorithm(); + EncryptionAlgorithm algo = file_encryption_properties->algorithm(); signing_encryption.aad.aad_file_unique = algo.aad.aad_file_unique; signing_encryption.aad.supply_aad_prefix = algo.aad.supply_aad_prefix; if (!algo.aad.supply_aad_prefix) signing_encryption.aad.aad_prefix = algo.aad.aad_prefix; signing_encryption.algorithm = ParquetCipher::AES_GCM_V1; file_metadata_ = metadata_->Finish( - &signing_encryption, file_encryption->footer_signing_key_metadata()); + &signing_encryption, file_encryption_properties->footer_signing_key_metadata()); auto footer_signing_encryptor = file_encryptor_->GetFooterSigningEncryptor(); WriteFileMetaData(*file_metadata_, sink_.get(), footer_signing_encryptor, false); @@ -388,13 +388,13 @@ class FileSerializer : public ParquetFileWriter::Contents { std::unique_ptr file_encryptor_; void StartFile() { - auto file_encryption = properties_->file_encryption(); - if (file_encryption == nullptr) { + auto 
file_encryption_properties = properties_->file_encryption(); + if (file_encryption_properties == nullptr) { // Unencrypted parquet files always start with PAR1 PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4)); } else { - file_encryptor_.reset(new InternalFileEncryptor(file_encryption)); - if (file_encryption->encrypted_footer()) { + file_encryptor_.reset(new InternalFileEncryptor(file_encryption_properties)); + if (file_encryption_properties->encrypted_footer()) { PARQUET_THROW_NOT_OK(sink_->Write(kParquetEMagic, 4)); } else { From bba24b8bbdc499f71ceff9b38d1c2640119f349e Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Mon, 13 May 2019 07:07:45 +0300 Subject: [PATCH 061/201] Remove unused footer_decryptor_ from InternalFileDecryptor class and fix indentation in GetFooterSigningEncryptor --- cpp/src/parquet/internal_file_decryptor.cc | 7 ++++--- cpp/src/parquet/internal_file_decryptor.h | 1 - 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index 81c551154afc..0835f0686bef 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -89,7 +89,8 @@ int Decryptor::Decrypt(const uint8_t* ciphertext, int ciphertext_len, std::shared_ptr InternalFileDecryptor::GetFooterSigningEncryptor() { - if (footer_signing_encryptor_ != NULLPTR) return footer_signing_encryptor_; + if (footer_signing_encryptor_ != NULLPTR) + return footer_signing_encryptor_; std::string footer_key = properties_->footer_key(); // ignore footer key metadata if footer key is explicitly set via API if (footer_key.empty()) { @@ -113,8 +114,8 @@ InternalFileDecryptor::GetFooterSigningEncryptor() { std::string aad = parquet_encryption::createFooterAAD(file_aad_); - footer_signing_encryptor_ = - std::make_shared(algorithm_, footer_key, file_aad_, aad); + footer_signing_encryptor_ = std::make_shared( + algorithm_, footer_key, file_aad_, aad); return 
footer_signing_encryptor_; } diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h index 733c448f4f7f..3abdae2f493b 100644 --- a/cpp/src/parquet/internal_file_decryptor.h +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -112,7 +112,6 @@ class InternalFileDecryptor { std::shared_ptr footer_data_decryptor_; ParquetCipher::type algorithm_; std::string footer_key_metadata_; - std::shared_ptr footer_decryptor_; std::shared_ptr footer_signing_encryptor_; std::unique_ptr meta_decryptor_128_; From 4bb0238dc91aaac94e566125b67a0695813454c1 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Mon, 13 May 2019 10:44:53 +0300 Subject: [PATCH 062/201] Fix format --- cpp/src/parquet/file_reader.cc | 21 ++++++++++----------- cpp/src/parquet/file_writer.cc | 3 ++- cpp/src/parquet/internal_file_decryptor.cc | 21 ++++++++++----------- cpp/src/parquet/internal_file_encryptor.cc | 1 - 4 files changed, 22 insertions(+), 24 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 1894fbbfc94f..1f976452d673 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -260,11 +260,11 @@ class SerializedFile : public ParquetFileReader::Contents { auto file_decryption_properties = properties_.file_decryption_properties(); if (!file_metadata_->is_encryption_algorithm_set()) { // Plaintext file - if (file_decryption_properties != NULLPTR) { - if (!file_decryption_properties->plaintext_files_allowed()) { - throw ParquetException("Applying decryption properties on plaintext file"); - } - } + if (file_decryption_properties != NULLPTR) { + if (!file_decryption_properties->plaintext_files_allowed()) { + throw ParquetException("Applying decryption properties on plaintext file"); + } + } } else { if (file_decryption_properties == NULLPTR) { throw ParquetException("No decryption properties are provided"); @@ -294,10 +294,11 @@ class SerializedFile : public ParquetFileReader::Contents { } 
std::string file_aad = aad_prefix + algo.aad.aad_file_unique; - file_decryptor_.reset(new InternalFileDecryptor(file_decryption_properties, - file_aad, algo.algorithm, - file_metadata_->footer_signing_key_metadata())); - + file_decryptor_.reset(new InternalFileDecryptor( + file_decryption_properties, + file_aad, algo.algorithm, + file_metadata_->footer_signing_key_metadata())); + if (file_decryption_properties->check_plaintext_footer_integrity()) { if (metadata_len - read_metadata_len != 28) { throw ParquetException( @@ -375,8 +376,6 @@ class SerializedFile : public ParquetFileReader::Contents { file_decryptor_.reset(new InternalFileDecryptor(file_decryption_properties, file_aad, algo.algorithm, file_crypto_metadata->key_metadata())); - - int64_t metadata_offset = file_size - kFooterSize - footer_len + crypto_metadata_len; uint32_t metadata_len = footer_len - crypto_metadata_len; diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index c8b1b5e011be..42810bef100a 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -315,7 +315,8 @@ class FileSerializer : public ParquetFileWriter::Contents { signing_encryption.aad.aad_prefix = algo.aad.aad_prefix; signing_encryption.algorithm = ParquetCipher::AES_GCM_V1; file_metadata_ = metadata_->Finish( - &signing_encryption, file_encryption_properties->footer_signing_key_metadata()); + &signing_encryption, + file_encryption_properties->footer_signing_key_metadata()); auto footer_signing_encryptor = file_encryptor_->GetFooterSigningEncryptor(); WriteFileMetaData(*file_metadata_, sink_.get(), footer_signing_encryptor, false); diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index 0835f0686bef..03870baa0f4b 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -73,18 +73,18 @@ int Decryptor::Decrypt(const uint8_t* ciphertext, int ciphertext_len, : properties_(properties), 
file_aad_(file_aad), algorithm_(algorithm), footer_key_metadata_(footer_key_metadata) { column_data_map_ = std::shared_ptr, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath>>( - new std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>()); + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath>>( + new std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>()); column_metadata_map_ = std::shared_ptr, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath>>( - new std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>()); + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath>>( + new std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>()); } std::shared_ptr @@ -231,7 +231,6 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( (*column_data_map_)[column_path] = decryptor; return decryptor; - } parquet_encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor( diff --git a/cpp/src/parquet/internal_file_encryptor.cc b/cpp/src/parquet/internal_file_encryptor.cc index cd0f648e7c2c..3801cf6c70be 100644 --- a/cpp/src/parquet/internal_file_encryptor.cc +++ b/cpp/src/parquet/internal_file_encryptor.cc @@ -71,7 +71,6 @@ std::shared_ptr InternalFileEncryptor::GetFooterEncryptor() { aad); footer_encryptor_ = encryptor; return encryptor; - } std::shared_ptr InternalFileEncryptor::GetFooterSigningEncryptor() { From a1f70393c1188b2b86137aaf256ea88f28b1985c Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Mon, 13 May 2019 11:36:37 +0300 Subject: [PATCH 063/201] Change implementation of NULL_STRING --- cpp/src/parquet/encryption.cc | 2 +- cpp/src/parquet/encryption.h | 13 ++++++++----- cpp/src/parquet/metadata.cc | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/cpp/src/parquet/encryption.cc b/cpp/src/parquet/encryption.cc index f7c4ab30cebe..7f3effb2540a 100644 --- a/cpp/src/parquet/encryption.cc +++ b/cpp/src/parquet/encryption.cc @@ -104,7 +104,7 @@ 
const std::string& FileDecryptionProperties::column_key( return column_prop->key(); } } - return NULL_STRING; + return empty_string_; } FileDecryptionProperties::FileDecryptionProperties( diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index a54c4bb26ab6..329845edf823 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -33,7 +33,6 @@ namespace parquet { -static const std::string NULL_STRING = ""; static constexpr ParquetCipher::type DEFAULT_ENCRYPTION_ALGORITHM = ParquetCipher::AES_GCM_V1; static constexpr int32_t MAXIMAL_AAD_METADATA_LENGTH = 256; @@ -364,6 +363,8 @@ class PARQUET_EXPORT FileDecryptionProperties { std::string aad_prefix_; std::shared_ptr aad_prefix_verifier_; + const std::string empty_string_ = ""; + std::map, std::shared_ptr, schema::ColumnPath::CmpColumnPath> column_properties_; @@ -481,19 +482,19 @@ class PARQUET_EXPORT FileEncryptionProperties { const EncryptionAlgorithm algorithm() { return algorithm_; } const std::string& footer_encryption_key() { - return (encrypted_footer_ ? footer_key_ : NULL_STRING); + return (encrypted_footer_ ? footer_key_ : empty_string_); } const std::string& footer_encryption_key_metadata() { - return (encrypted_footer_ ? footer_key_metadata_ : NULL_STRING); + return (encrypted_footer_ ? footer_key_metadata_ : empty_string_); } const std::string& footer_signing_key() { - return (encrypted_footer_ ? NULL_STRING : footer_key_); + return (encrypted_footer_ ? empty_string_ : footer_key_); } const std::string& footer_signing_key_metadata() { - return (encrypted_footer_ ? NULL_STRING : footer_key_metadata_); + return (encrypted_footer_ ? 
empty_string_ : footer_key_metadata_); } const std::string& file_aad() const { return file_aad_; } @@ -508,6 +509,8 @@ class PARQUET_EXPORT FileEncryptionProperties { bool encrypted_footer_; std::string file_aad_; + const std::string empty_string_ = ""; + std::map, std::shared_ptr, schema::ColumnPath::CmpColumnPath> column_properties_; diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 355e01d7b601..422bc922f13c 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -1204,7 +1204,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { : properties_(props), schema_(schema), key_value_metadata_(key_value_metadata) { metadata_.reset(new format::FileMetaData()); if (props->file_encryption() != nullptr && - props->file_encryption()->footer_signing_key() == NULL_STRING) { + props->file_encryption()->footer_signing_key().empty()) { crypto_metadata_.reset(new format::FileCryptoMetaData()); } } From b8f5fba6854d3b7d4ba0a4d21939c908e66c7b69 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Mon, 13 May 2019 11:45:01 +0300 Subject: [PATCH 064/201] Change ParquetException message format in file_reader.cc --- cpp/src/parquet/file_reader.cc | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 1f976452d673..fe3b8a20f366 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -277,8 +277,7 @@ class SerializedFile : public ParquetFileReader::Contents { if (!aad_prefix.empty()) { if (aad_prefix.compare(algo.aad.aad_prefix) != 0) { throw ParquetException( - "ADD Prefix in file and " - "in properties is not the same"); + "ADD Prefix in file and in properties is not the same"); } } aad_prefix = algo.aad.aad_prefix; @@ -288,9 +287,8 @@ class SerializedFile : public ParquetFileReader::Contents { } if (algo.aad.supply_aad_prefix && aad_prefix.empty()) { throw ParquetException( - "AAD prefix used for file 
encryption, " - "but not stored in file and not supplied " - "in decryption properties"); + "AAD prefix used for file encryption, but not stored in file" + "and not supplied in decryption properties"); } std::string file_aad = aad_prefix + algo.aad.aad_file_unique; @@ -302,16 +300,14 @@ class SerializedFile : public ParquetFileReader::Contents { if (file_decryption_properties->check_plaintext_footer_integrity()) { if (metadata_len - read_metadata_len != 28) { throw ParquetException( - "Invalid parquet file. Cannot verify plaintext" - "mode footer."); + "Invalid parquet file. Cannot verify plaintext mode footer."); } auto encryptor = file_decryptor_->GetFooterSigningEncryptor(); if (!file_metadata_->verify(encryptor, metadata_buffer->data() + read_metadata_len)) { - throw ParquetException( - "Invalid parquet file. Could not verify plaintext" - " footer metadata"); + throw ParquetException("Invalid parquet file. Could not verify plaintext " + "footer metadata"); } } } From 5d023b242b2ace17060ccbdeb94aa53a2833163f Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Mon, 13 May 2019 12:12:38 +0300 Subject: [PATCH 065/201] Make format --- cpp/src/parquet/encryption.cc | 6 +- cpp/src/parquet/encryption.h | 8 +-- cpp/src/parquet/file_reader.cc | 39 ++++++----- cpp/src/parquet/internal_file_decryptor.cc | 79 +++++++++++----------- cpp/src/parquet/internal_file_decryptor.h | 16 ++--- cpp/src/parquet/internal_file_encryptor.cc | 32 ++++----- cpp/src/parquet/internal_file_encryptor.h | 18 ++--- 7 files changed, 96 insertions(+), 102 deletions(-) diff --git a/cpp/src/parquet/encryption.cc b/cpp/src/parquet/encryption.cc index 7f3effb2540a..c53de6746879 100644 --- a/cpp/src/parquet/encryption.cc +++ b/cpp/src/parquet/encryption.cc @@ -17,10 +17,10 @@ #include "parquet/encryption.h" -#include -#include #include +#include #include +#include #include "arrow/util/utf8.h" @@ -114,7 +114,7 @@ FileDecryptionProperties::FileDecryptionProperties( std::shared_ptr aad_prefix_verifier, 
const std::map, std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties, + schema::ColumnPath::CmpColumnPath>& column_properties, bool plaintext_files_allowed) { DCHECK(!footer_key.empty() || NULLPTR != key_retriever || 0 != column_properties.size()); diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 329845edf823..2c969bde2a9f 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -19,8 +19,8 @@ #define PARQUET_ENCRYPTION_H #include -#include #include +#include #include #include "arrow/util/logging.h" @@ -30,7 +30,6 @@ #include "parquet/types.h" #include "parquet/util/visibility.h" - namespace parquet { static constexpr ParquetCipher::type DEFAULT_ENCRYPTION_ALGORITHM = @@ -318,7 +317,7 @@ class PARQUET_EXPORT FileDecryptionProperties { // The caller should use then a different method to ensure encryption // of files with sensitive data. Builder* plaintext_files_allowed() { - plaintext_files_allowed_ = true; + plaintext_files_allowed_ = true; return this; } @@ -380,7 +379,7 @@ class PARQUET_EXPORT FileDecryptionProperties { std::shared_ptr aad_prefix_verifier, const std::map, std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties, + schema::ColumnPath::CmpColumnPath>& column_properties, bool plaintext_files_allowed); }; @@ -524,7 +523,6 @@ class PARQUET_EXPORT FileEncryptionProperties { schema::ColumnPath::CmpColumnPath>& column_properties); }; - } // namespace parquet #endif // PARQUET_ENCRYPTION_H diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index fe3b8a20f366..f80f793a6a26 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -29,11 +29,11 @@ #include "arrow/util/logging.h" #include "arrow/util/ubsan.h" -#include "parquet/file_writer.h" #include "parquet/column_reader.h" #include "parquet/column_scanner.h" #include "parquet/deprecated_io.h" #include "parquet/exception.h" +#include "parquet/file_writer.h" #include 
"parquet/internal_file_decryptor.h" #include "parquet/metadata.h" #include "parquet/platform.h" @@ -59,9 +59,9 @@ RowGroupReader::RowGroupReader(std::unique_ptr contents) : contents_(std::move(contents)) {} std::shared_ptr RowGroupReader::Column(int i) { - DCHECK(i < metadata()->num_columns()) - << "The RowGroup only has " << metadata()->num_columns() - << "columns, requested column: " << i; + DCHECK(i < metadata()->num_columns()) << "The RowGroup only has " + << metadata()->num_columns() + << "columns, requested column: " << i; const ColumnDescriptor* descr = metadata()->schema()->Column(i); std::unique_ptr page_reader = contents_->GetColumnPageReader(i); @@ -71,9 +71,9 @@ std::shared_ptr RowGroupReader::Column(int i) { } std::unique_ptr RowGroupReader::GetColumnPageReader(int i) { - DCHECK(i < metadata()->num_columns()) - << "The RowGroup only has " << metadata()->num_columns() - << "columns, requested column: " << i; + DCHECK(i < metadata()->num_columns()) << "The RowGroup only has " + << metadata()->num_columns() + << "columns, requested column: " << i; return contents_->GetColumnPageReader(i); } @@ -193,8 +193,7 @@ class SerializedFile : public ParquetFileReader::Contents { std::shared_ptr GetRowGroup(int i) override { std::unique_ptr contents(new SerializedRowGroup( - source_, file_metadata_.get(), i, properties_, - file_decryptor_.get())); + source_, file_metadata_.get(), i, properties_, file_decryptor_.get())); return std::make_shared(std::move(contents)); } @@ -259,7 +258,7 @@ class SerializedFile : public ParquetFileReader::Contents { file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &read_metadata_len); auto file_decryption_properties = properties_.file_decryption_properties(); - if (!file_metadata_->is_encryption_algorithm_set()) { // Plaintext file + if (!file_metadata_->is_encryption_algorithm_set()) { // Plaintext file if (file_decryption_properties != NULLPTR) { if (!file_decryption_properties->plaintext_files_allowed()) { throw 
ParquetException("Applying decryption properties on plaintext file"); @@ -293,8 +292,7 @@ class SerializedFile : public ParquetFileReader::Contents { std::string file_aad = aad_prefix + algo.aad.aad_file_unique; file_decryptor_.reset(new InternalFileDecryptor( - file_decryption_properties, - file_aad, algo.algorithm, + file_decryption_properties, file_aad, algo.algorithm, file_metadata_->footer_signing_key_metadata())); if (file_decryption_properties->check_plaintext_footer_integrity()) { @@ -306,8 +304,9 @@ class SerializedFile : public ParquetFileReader::Contents { auto encryptor = file_decryptor_->GetFooterSigningEncryptor(); if (!file_metadata_->verify(encryptor, metadata_buffer->data() + read_metadata_len)) { - throw ParquetException("Invalid parquet file. Could not verify plaintext " - "footer metadata"); + throw ParquetException( + "Invalid parquet file. Could not verify plaintext " + "footer metadata"); } } } @@ -369,9 +368,9 @@ class SerializedFile : public ParquetFileReader::Contents { "in decryption properties"); } std::string file_aad = aad_prefix + algo.aad.aad_file_unique; - file_decryptor_.reset(new InternalFileDecryptor(file_decryption_properties, - file_aad, algo.algorithm, - file_crypto_metadata->key_metadata())); + file_decryptor_.reset( + new InternalFileDecryptor(file_decryption_properties, file_aad, algo.algorithm, + file_crypto_metadata->key_metadata())); int64_t metadata_offset = file_size - kFooterSize - footer_len + crypto_metadata_len; uint32_t metadata_len = footer_len - crypto_metadata_len; @@ -479,9 +478,9 @@ std::shared_ptr ParquetFileReader::metadata() const { } std::shared_ptr ParquetFileReader::RowGroup(int i) { - DCHECK(i < metadata()->num_row_groups()) - << "The file only has " << metadata()->num_row_groups() - << "row groups, requested reader for: " << i; + DCHECK(i < metadata()->num_row_groups()) << "The file only has " + << metadata()->num_row_groups() + << "row groups, requested reader for: " << i; return 
contents_->GetRowGroup(i); } diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index 03870baa0f4b..6bc93a33884a 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -66,31 +66,30 @@ int Decryptor::Decrypt(const uint8_t* ciphertext, int ciphertext_len, } // InternalFileDecryptor - InternalFileDecryptor::InternalFileDecryptor(FileDecryptionProperties* properties, - const std::string& file_aad, - ParquetCipher::type algorithm, - const std::string& footer_key_metadata) - : properties_(properties), file_aad_(file_aad), - algorithm_(algorithm), footer_key_metadata_(footer_key_metadata) { - column_data_map_ = std::shared_ptr, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath>>( - new std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>()); - - column_metadata_map_ = std::shared_ptr, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath>>( - new std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>()); - } +InternalFileDecryptor::InternalFileDecryptor(FileDecryptionProperties* properties, + const std::string& file_aad, + ParquetCipher::type algorithm, + const std::string& footer_key_metadata) + : properties_(properties), + file_aad_(file_aad), + algorithm_(algorithm), + footer_key_metadata_(footer_key_metadata) { + column_data_map_ = std::shared_ptr< + std::map, std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath>>( + new std::map, std::shared_ptr, + schema::ColumnPath::CmpColumnPath>()); + + column_metadata_map_ = std::shared_ptr< + std::map, std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath>>( + new std::map, std::shared_ptr, + schema::ColumnPath::CmpColumnPath>()); +} std::shared_ptr InternalFileDecryptor::GetFooterSigningEncryptor() { - if (footer_signing_encryptor_ != NULLPTR) - return footer_signing_encryptor_; + if (footer_signing_encryptor_ != NULLPTR) return footer_signing_encryptor_; 
std::string footer_key = properties_->footer_key(); // ignore footer key metadata if footer key is explicitly set via API if (footer_key.empty()) { @@ -114,8 +113,8 @@ InternalFileDecryptor::GetFooterSigningEncryptor() { std::string aad = parquet_encryption::createFooterAAD(file_aad_); - footer_signing_encryptor_ = std::make_shared( - algorithm_, footer_key, file_aad_, aad); + footer_signing_encryptor_ = + std::make_shared(algorithm_, footer_key, file_aad_, aad); return footer_signing_encryptor_; } @@ -165,7 +164,7 @@ std::shared_ptr InternalFileDecryptor::GetFooterDecryptor( auto aes_decryptor = metadata ? GetMetaAesDecryptor(footer_key.size()) : GetDataAesDecryptor(footer_key.size()); std::shared_ptr decryptor = - std::make_shared(aes_decryptor, footer_key, file_aad_, aad); + std::make_shared(aes_decryptor, footer_key, file_aad_, aad); if (metadata) footer_metadata_decryptor_ = decryptor; @@ -202,29 +201,29 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( } } - column_key = properties_->column_key(column_path); - // No explicit column key given via API. Retrieve via key metadata. - if (column_key.empty() && !column_key_metadata.empty() && - properties_->key_retriever() != nullptr) { - try { - column_key = properties_->key_retriever()->GetKey(column_key_metadata); - } catch (KeyAccessDeniedException& e) { - std::stringstream ss; - ss << "HiddenColumnException, path=" + column_path->ToDotString() + " " - << e.what() << "\n"; - throw HiddenColumnException(ss.str()); - } + column_key = properties_->column_key(column_path); + // No explicit column key given via API. Retrieve via key metadata. 
+ if (column_key.empty() && !column_key_metadata.empty() && + properties_->key_retriever() != nullptr) { + try { + column_key = properties_->key_retriever()->GetKey(column_key_metadata); + } catch (KeyAccessDeniedException& e) { + std::stringstream ss; + ss << "HiddenColumnException, path=" + column_path->ToDotString() + " " << e.what() + << "\n"; + throw HiddenColumnException(ss.str()); } + } if (column_key.empty()) { throw HiddenColumnException("HiddenColumnException, path=" + column_path->ToDotString()); } auto aes_decryptor = metadata ? GetMetaAesDecryptor(column_key.size()) - : GetDataAesDecryptor(column_key.size()); + : GetDataAesDecryptor(column_key.size()); - std::shared_ptr decryptor = std::make_shared( - aes_decryptor, column_key, file_aad_, aad); + std::shared_ptr decryptor = + std::make_shared(aes_decryptor, column_key, file_aad_, aad); if (metadata) (*column_metadata_map_)[column_path] = decryptor; else diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h index 3abdae2f493b..1e0a587b6f90 100644 --- a/cpp/src/parquet/internal_file_decryptor.h +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -99,14 +99,14 @@ class InternalFileDecryptor { FileDecryptionProperties* properties_; // Concatenation of aad_prefix (if exists) and aad_file_unique std::string file_aad_; - std::shared_ptr, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath>> - column_data_map_; - std::shared_ptr, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath>> - column_metadata_map_; + std::shared_ptr< + std::map, std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath>> + column_data_map_; + std::shared_ptr< + std::map, std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath>> + column_metadata_map_; std::shared_ptr footer_metadata_decryptor_; std::shared_ptr footer_data_decryptor_; diff --git a/cpp/src/parquet/internal_file_encryptor.cc b/cpp/src/parquet/internal_file_encryptor.cc index 
3801cf6c70be..968b165785d9 100644 --- a/cpp/src/parquet/internal_file_encryptor.cc +++ b/cpp/src/parquet/internal_file_encryptor.cc @@ -45,17 +45,17 @@ int Encryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* cip // InternalFileEncryptor InternalFileEncryptor::InternalFileEncryptor(FileEncryptionProperties* properties) : properties_(properties) { - column_data_map_ = std::shared_ptr, - std::shared_ptr, parquet::schema::ColumnPath::CmpColumnPath>>( - new std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>()); - - column_metadata_map_ = std::shared_ptr, - std::shared_ptr, parquet::schema::ColumnPath::CmpColumnPath>>( - new std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>()); + column_data_map_ = std::shared_ptr< + std::map, std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath>>( + new std::map, std::shared_ptr, + schema::ColumnPath::CmpColumnPath>()); + + column_metadata_map_ = std::shared_ptr< + std::map, std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath>>( + new std::map, std::shared_ptr, + schema::ColumnPath::CmpColumnPath>()); } std::shared_ptr InternalFileEncryptor::GetFooterEncryptor() { @@ -67,8 +67,7 @@ std::shared_ptr InternalFileEncryptor::GetFooterEncryptor() { std::string footer_key = properties_->footer_encryption_key(); auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_key.size()); std::shared_ptr encryptor = std::make_shared( - aes_encryptor, footer_key, properties_->file_aad(), - aad); + aes_encryptor, footer_key, properties_->file_aad(), aad); footer_encryptor_ = encryptor; return encryptor; } @@ -82,8 +81,7 @@ std::shared_ptr InternalFileEncryptor::GetFooterSigningEncryptor() { std::string footer_signing_key = properties_->footer_signing_key(); auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_signing_key.size()); std::shared_ptr encryptor = std::make_shared( - aes_encryptor, footer_signing_key, - properties_->file_aad(), aad); + aes_encryptor, 
footer_signing_key, properties_->file_aad(), aad); footer_signing_encryptor_ = encryptor; return encryptor; } @@ -132,8 +130,8 @@ InternalFileEncryptor::InternalFileEncryptor::GetColumnEncryptor( : GetDataAesEncryptor(algorithm, key.size()); std::string file_aad = properties_->file_aad(); - std::shared_ptr encryptor = std::make_shared( - aes_encryptor, key, file_aad, ""); + std::shared_ptr encryptor = + std::make_shared(aes_encryptor, key, file_aad, ""); if (metadata) (*column_metadata_map_)[column_path] = encryptor; else diff --git a/cpp/src/parquet/internal_file_encryptor.h b/cpp/src/parquet/internal_file_encryptor.h index bfa7cca3e0dd..7fcb943735cf 100644 --- a/cpp/src/parquet/internal_file_encryptor.h +++ b/cpp/src/parquet/internal_file_encryptor.h @@ -18,9 +18,9 @@ #ifndef INTERNAL_FILE_ENCRYPTOR_H #define INTERNAL_FILE_ENCRYPTOR_H +#include #include #include -#include #include "parquet/schema.h" @@ -63,14 +63,14 @@ class InternalFileEncryptor { private: FileEncryptionProperties* properties_; - std::shared_ptr, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath>> - column_data_map_; - std::shared_ptr, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath>> - column_metadata_map_; + std::shared_ptr< + std::map, std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath>> + column_data_map_; + std::shared_ptr< + std::map, std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath>> + column_metadata_map_; std::shared_ptr footer_signing_encryptor_; std::shared_ptr footer_encryptor_; From 04cda18feedbf18bff9c336c3d79c35301997c95 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 10:54:11 +0300 Subject: [PATCH 066/201] Add comments to encryption-reader-writer.cc example --- .../low-level-api/encryption-reader-writer.cc | 148 ++++++++++-------- 1 file changed, 81 insertions(+), 67 deletions(-) diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc 
b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc index e0d44d60667e..a75c0352cdab 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc @@ -22,24 +22,22 @@ #include -/* - * This example describes writing and reading Parquet Files in C++ and serves as a - * reference to the API. - * The file contains all the physical data types supported by Parquet. - * This example uses the RowGroupWriter API that supports writing RowGroups optimized for - *memory consumption - **/ -/* Parquet is a structured columnar file format - * Parquet File = "Parquet data" + "Parquet Metadata" - * "Parquet data" is simply a vector of RowGroups. Each RowGroup is a batch of rows in a - * columnar layout - * "Parquet Metadata" contains the "file schema" and attributes of the RowGroups and their - * Columns - * "file schema" is a tree where each node is either a primitive type (leaf nodes) or a - * complex (nested) type (internal nodes) - * For specific details, please refer the format here: - * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md + +/* + * This example describes writing and reading Parquet Files in C++ with encrypted columns + * and serves as a reference to the Parquet Modular Encryption API. + * + * A detailed description of the Parquet Modular Encryption specification can be found here: + * https://github.com/apache/parquet-format/blob/encryption/Encryption.md + * + * The example contains writing and reading eight columns with the following four different + * encryption configurations: + * 1) uniform encryption - footer and all columns are encrypted with footer key. + * 2) non-uniform encryption - footer and ba_field column are encrypted with different keys. + * 3) plaintext footer mode where all columns are encrypted with footer key. + * 4) footer and ba_field column are encrypted with different keys. 
no column key + * is provided upon decryption and thus HiddenColumnException is thrown. **/ constexpr int NUM_ROWS_PER_ROW_GROUP = 500; @@ -49,71 +47,83 @@ const std::string COLUMN_ENCRYPTION_KEY = "1234567890123450"; // 16 bytes int main(int argc, char** argv) { - std::vector> file_encryption_properties; - std::vector> file_decryption_properties; + std::vector> vector_of_encryption_configurations; + std::vector> vector_of_decryption_configurations; - // uniform encryption + // encryption configuration #1 - uniform encryption - all columns and footer are + // encrypted with footer key. parquet::FileEncryptionProperties::Builder file_encryption_builder_1(FOOTER_ENCRYPTION_KEY); - parquet::FileDecryptionProperties::Builder decryption_properties_builder_1; - decryption_properties_builder_1.footer_key(FOOTER_ENCRYPTION_KEY); - - // non-uniform with column keys + parquet::FileDecryptionProperties::Builder file_decryption_builder_1; + + // Add the properties to the appropriate configurations vectors + vector_of_encryption_configurations.push_back(file_encryption_builder_1.build()); + vector_of_decryption_configurations.push_back(file_decryption_builder_1 + .footer_key(FOOTER_ENCRYPTION_KEY) + ->build()); + + // encryption configuration #2 - footer and ba_field column are encrypted with + // different keys. 
std::map, std::shared_ptr, parquet::schema::ColumnPath::CmpColumnPath> encryption_cols; std::shared_ptr path_ptr = parquet::schema::ColumnPath::FromDotString("ba_field"); parquet::ColumnEncryptionProperties::Builder encryption_col_builder_0(path_ptr); - encryption_col_builder_0.key(COLUMN_ENCRYPTION_KEY); - auto encryption_col0 = encryption_col_builder_0.build(); - encryption_cols[path_ptr] = encryption_col0; - - parquet::FileEncryptionProperties::Builder file_encryption_builder_2(FOOTER_ENCRYPTION_KEY); - std::map, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath> decryption_cols; - parquet::ColumnDecryptionProperties::Builder decryption_col_builder2(path_ptr); - decryption_col_builder2.key(COLUMN_ENCRYPTION_KEY); - decryption_cols[path_ptr] = decryption_col_builder2.build(); - - file_encryption_builder_2.column_properties(encryption_cols); + encryption_cols[path_ptr] = encryption_col_builder_0. + key(COLUMN_ENCRYPTION_KEY) + ->build(); - parquet::FileDecryptionProperties::Builder decryption_properties_builder_2; - decryption_properties_builder_2.footer_key(FOOTER_ENCRYPTION_KEY); - decryption_properties_builder_2.column_properties(decryption_cols); + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> decryption_cols; + parquet::ColumnDecryptionProperties::Builder decryption_col_builder2(path_ptr); + decryption_cols[path_ptr] = decryption_col_builder2. 
+ key(COLUMN_ENCRYPTION_KEY) + ->build(); - // plain mode footer = unencrypted footer + parquet::FileEncryptionProperties::Builder file_encryption_builder_2(FOOTER_ENCRYPTION_KEY); + parquet::FileDecryptionProperties::Builder file_decryption_builder_2; + + // Add the properties to the appropriate configurations vectors + vector_of_encryption_configurations.push_back(file_encryption_builder_2 + .column_properties(encryption_cols) + ->build()); + vector_of_decryption_configurations.push_back(file_decryption_builder_2 + .footer_key(FOOTER_ENCRYPTION_KEY) + ->column_properties(decryption_cols) + ->build()); + + // encryption configuration #3 - plain mode footer parquet::FileEncryptionProperties::Builder file_encryption_builder_3(FOOTER_ENCRYPTION_KEY); - file_encryption_builder_3.enable_plaintext_footer(); - - parquet::FileDecryptionProperties::Builder decryption_properties_builder_3; - decryption_properties_builder_3.footer_key(FOOTER_ENCRYPTION_KEY); - - // plaintext mode footer, hidden column + parquet::FileDecryptionProperties::Builder file_decryption_builder_3; + + // Add the properties to the appropriate configurations vectors + vector_of_encryption_configurations.push_back(file_encryption_builder_3 + .set_plaintext_footer() + ->build()); + vector_of_decryption_configurations.push_back(file_decryption_builder_3 + .footer_key(FOOTER_ENCRYPTION_KEY) + ->build()); + + // encryption configuration #4 - footer and ba_field column are encrypted with different keys. + // no column key is provided upon decryption and thus HiddenColumnException is thrown. 
parquet::FileEncryptionProperties::Builder file_encryption_builder_4(FOOTER_ENCRYPTION_KEY); + parquet::FileDecryptionProperties::Builder file_decryption_builder_4; - file_encryption_builder_4.enable_plaintext_footer(); - file_encryption_builder_4.column_properties(encryption_cols); // reusing encryption_cols - parquet::FileDecryptionProperties::Builder decryption_properties_builder_4; - decryption_properties_builder_4.footer_key(FOOTER_ENCRYPTION_KEY); + // Add the properties to the appropriate configurations vectors + vector_of_encryption_configurations.push_back(file_encryption_builder_4 + .set_plaintext_footer() + ->column_properties(encryption_cols) + ->build()); - file_encryption_properties.push_back(file_encryption_builder_1.build()); - file_encryption_properties.push_back(file_encryption_builder_2.build()); - file_encryption_properties.push_back(file_encryption_builder_3.build()); - file_encryption_properties.push_back(file_encryption_builder_4.build()); + vector_of_decryption_configurations.push_back(file_decryption_builder_4 + .footer_key(FOOTER_ENCRYPTION_KEY) + ->build()); - file_decryption_properties.push_back(decryption_properties_builder_1.build()); - file_decryption_properties.push_back(decryption_properties_builder_2.build()); - file_decryption_properties.push_back(decryption_properties_builder_3.build()); - file_decryption_properties.push_back(decryption_properties_builder_4.build()); - - for (unsigned example_id = 0; example_id < file_encryption_properties.size(); ++example_id) { + for (unsigned example_id = 0; example_id < vector_of_encryption_configurations.size(); ++example_id) { /********************************************************************************** PARQUET WRITER EXAMPLE **********************************************************************************/ - // parquet::REQUIRED fields do not need definition and repetition level values - // parquet::OPTIONAL fields require only definition level values - // parquet::REPEATED fields 
require both definition and repetition level values - // setup for encryption + try { // Create a local file output stream instance. @@ -127,7 +137,9 @@ int main(int argc, char** argv) { // Add writer properties parquet::WriterProperties::Builder builder; builder.compression(parquet::Compression::SNAPPY); - builder.encryption(file_encryption_properties[example_id]); + + // Add the current encryption configuration to WriterProperties. + builder.encryption(vector_of_encryption_configurations[example_id]); std::shared_ptr props = builder.build(); @@ -243,7 +255,9 @@ int main(int argc, char** argv) { try { parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); - reader_properties.file_decryption_properties(file_decryption_properties[example_id]); + + // Add the current decryption configuration to ReaderProperties. + reader_properties.file_decryption_properties(vector_of_decryption_configurations[example_id]); // Create a ParquetReader instance std::unique_ptr parquet_reader = From 5d7b27121fb363c53776f869bbe41f8672df2fb0 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 10:56:21 +0300 Subject: [PATCH 067/201] Rename enable_plaintext_footer to set_plaintext_footer --- cpp/src/parquet/encryption.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 2c969bde2a9f..e1f66b7fa8e6 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -396,7 +396,7 @@ class PARQUET_EXPORT FileEncryptionProperties { // Create files with plaintext footer. // If not called, the files will be created with encrypted footer (default). 
- Builder* enable_plaintext_footer() { + Builder* set_plaintext_footer() { encrypted_footer_ = false; return this; } From d676693d9d68613bb0515c8af1005a88b85ab5be Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 11:03:16 +0300 Subject: [PATCH 068/201] Rename aad variable in NextPage function --- cpp/src/parquet/column_reader.cc | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 16ab2067b8d2..a4fec0d69940 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -210,7 +210,6 @@ std::shared_ptr SerializedPageReader::NextPage() { while (seen_num_rows_ < total_num_rows_) { uint32_t header_size = 0; uint32_t allowed_page_size = kDefaultPageHeaderSize; - std::string aad; // Page headers can be very large because of page statistics // We try to deserialize a larger buffer progressively @@ -227,10 +226,11 @@ std::shared_ptr SerializedPageReader::NextPage() { try { if (meta_decryptor_ != NULLPTR) { if (current_page_is_dictionary) { - aad = parquet_encryption::createModuleAAD( + std::string dictionary_page_header_aad; + dictionary_page_header_aad = parquet_encryption::createModuleAAD( meta_decryptor_->file_aad(), parquet_encryption::DictionaryPageHeader, row_group_ordinal_, column_ordinal_, (int16_t)-1); - meta_decryptor_->update_aad(aad); + meta_decryptor_->update_aad(dictionary_page_header_aad); } else { parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, page_ordinal_); meta_decryptor_->update_aad(data_page_headerAAD_); @@ -259,10 +259,11 @@ std::shared_ptr SerializedPageReader::NextPage() { if (data_decryptor_ != NULLPTR) { DCHECK(!data_decryptor_->file_aad().empty()); if (current_page_is_dictionary) { - aad = parquet_encryption::createModuleAAD( + std::string dictionary_page_aad; + dictionary_page_aad = parquet_encryption::createModuleAAD( data_decryptor_->file_aad(), parquet_encryption::DictionaryPage, 
row_group_ordinal_, column_ordinal_, (int16_t)-1); - data_decryptor_->update_aad(aad); + data_decryptor_->update_aad(dictionary_page_aad); } else { parquet_encryption::quickUpdatePageAAD(data_pageAAD_, page_ordinal_); data_decryptor_->update_aad(data_pageAAD_); From 59d4abb9f0982a3434279a42202e02a2c3d7d57d Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 11:07:26 +0300 Subject: [PATCH 069/201] Change comment in GetColumnPageReader --- cpp/src/parquet/file_reader.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index f80f793a6a26..c49fbbaa04d0 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -128,8 +128,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { bool encrypted = true; - // file is unencrypted - // or file is encrypted but column is unencrypted + // Column is encrypted only if crypto_metadata exists. if (!crypto_metadata) { encrypted = false; } From a36cbf88d94cffc368b13f6195fb18d1667de8a2 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 11:10:58 +0300 Subject: [PATCH 070/201] Change additional comments in GetColumnPageReader --- cpp/src/parquet/file_reader.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index c49fbbaa04d0..8a8d2966e4e7 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -139,9 +139,9 @@ class SerializedRowGroup : public RowGroupReader::Contents { (int16_t)i/* column_ordinal */, properties_.memory_pool()); } - // the column is encrypted + // The column is encrypted - // the column is encrypted with footer key + // The column is encrypted with footer key if (crypto_metadata->encrypted_with_footer_key()) { auto meta_decryptor = file_decryptor_->GetFooterDecryptorForColumnMeta(); auto data_decryptor = file_decryptor_->GetFooterDecryptorForColumnData(); 
@@ -151,8 +151,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { properties_.memory_pool(), meta_decryptor, data_decryptor); } - // file is encrypted and the column is encrypted with its own key - + // The column is encrypted with its own key std::string column_key_metadata = crypto_metadata->key_metadata(); std::shared_ptr column_path = std::make_shared(crypto_metadata->path_in_schema()); From 7810db30ab791a7d16ef63d179d18a17b316d8c4 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 11:18:20 +0300 Subject: [PATCH 071/201] Add comments in file_writer.cc --- cpp/src/parquet/file_writer.cc | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 42810bef100a..bd2a77cd5bab 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -287,10 +287,12 @@ class FileSerializer : public ParquetFileWriter::Contents { // Write magic bytes and metadata auto file_encryption_properties = properties_->file_encryption(); - if (file_encryption_properties == nullptr) { + + if (file_encryption_properties == nullptr) { // plaintext regular file file_metadata_ = metadata_->Finish(); WriteFileMetaData(*file_metadata_, sink_.get()); - } else { + } else { // Encrypted file + //Encrypted file with encrypted footer if (file_encryption_properties->encrypted_footer()) { // encrypted footer file_metadata_ = metadata_->Finish(); @@ -305,8 +307,7 @@ class FileSerializer : public ParquetFileWriter::Contents { static_cast(sink_->Tell() - metadata_start); sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4); sink_->Write(kParquetEMagic, 4); - } else { - // footer plain mode + } else { // Encrypted file with plaintext footer EncryptionAlgorithm signing_encryption; EncryptionAlgorithm algo = file_encryption_properties->algorithm(); signing_encryption.aad.aad_file_unique = algo.aad.aad_file_unique; @@ -440,7 +441,7 @@ std::unique_ptr 
ParquetFileWriter::Open( void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sink, const std::shared_ptr& encryptor, bool encrypt_footer) { - if (encryptor == nullptr) { + if (encryptor == nullptr) { // plaintext regular file // Write MetaData int64_t position = -1; PARQUET_THROW_NOT_OK(sink->Tell(&position)); @@ -453,11 +454,11 @@ void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sin // Write Footer PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast(&metadata_len), 4)); PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4)); - } else { - if (encrypt_footer) { + } else { // Encrypted file + if (encrypt_footer) { // Encrypted file with encrypted footer // encrypt and write to sink file_metadata.WriteTo(sink, encryptor); - } else { + } else { // Encrypted file with plaintext footer uint32_t metadata_len = static_cast(sink->Tell()); file_metadata.WriteTo(sink, encryptor); metadata_len = static_cast(sink->Tell()) - metadata_len; From ab94416d0033b71cb345e2c96d8fd0e215d599b1 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 11:33:30 +0300 Subject: [PATCH 072/201] Create both data and metadata decryptors to avoid redundant retrieval of key from the key_retriever --- cpp/src/parquet/internal_file_decryptor.cc | 46 +++++++++++++--------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index 6bc93a33884a..e49d66ce4cc1 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -161,17 +161,22 @@ std::shared_ptr InternalFileDecryptor::GetFooterDecryptor( "Could not parse footer metadata"); } - auto aes_decryptor = metadata ? 
GetMetaAesDecryptor(footer_key.size()) - : GetDataAesDecryptor(footer_key.size()); - std::shared_ptr decryptor = - std::make_shared(aes_decryptor, footer_key, file_aad_, aad); + // Create both data and metadata decryptors to avoid redundant retrieval of key + // from the key_retriever. + auto aes_metadata_decryptor = GetMetaAesDecryptor(footer_key.size()); + auto aes_data_decryptor = GetDataAesDecryptor(footer_key.size()); - if (metadata) - footer_metadata_decryptor_ = decryptor; - else - footer_data_decryptor_ = decryptor; + std::shared_ptr footer_metadata_decryptor = + std::make_shared(aes_metadata_decryptor, footer_key, file_aad_, aad); + std::shared_ptr footer_data_decryptor = + std::make_shared(aes_data_decryptor, footer_key, file_aad_, aad); + + footer_metadata_decryptor_ = footer_metadata_decryptor; + footer_data_decryptor_ = footer_data_decryptor; - return decryptor; + if (metadata) + return footer_metadata_decryptor; + return footer_data_decryptor; } std::shared_ptr InternalFileDecryptor::GetColumnMetaDecryptor( @@ -219,17 +224,22 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( column_path->ToDotString()); } - auto aes_decryptor = metadata ? GetMetaAesDecryptor(column_key.size()) - : GetDataAesDecryptor(column_key.size()); + // Create both data and metadata decryptors to avoid redundant retrieval of key + // using the key_retriever. 
+ auto aes_metadata_decryptor = GetMetaAesDecryptor(column_key.size()); + auto aes_data_decryptor = GetDataAesDecryptor(column_key.size()); - std::shared_ptr decryptor = - std::make_shared(aes_decryptor, column_key, file_aad_, aad); - if (metadata) - (*column_metadata_map_)[column_path] = decryptor; - else - (*column_data_map_)[column_path] = decryptor; + std::shared_ptr metadata_decryptor = + std::make_shared(aes_metadata_decryptor, column_key, file_aad_, aad); + std::shared_ptr data_decryptor = + std::make_shared(aes_data_decryptor, column_key, file_aad_, aad); + + (*column_metadata_map_)[column_path] = metadata_decryptor; + (*column_data_map_)[column_path] = data_decryptor; - return decryptor; + if (metadata) + return metadata_decryptor; + return data_decryptor; } parquet_encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor( From 0a8e030e36bcd2398bdbab4ce13dbee7f3e2be6d Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 11:38:44 +0300 Subject: [PATCH 073/201] Fix metadata parameter sent to parquet_encryption::AesDecryptor --- cpp/src/parquet/internal_file_decryptor.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index e49d66ce4cc1..d08b807c9adf 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -248,19 +248,19 @@ parquet_encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor( if (key_len == 16) { if (meta_decryptor_128_ == NULLPTR) { meta_decryptor_128_.reset( - new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); + new parquet_encryption::AesDecryptor(algorithm_, key_len, true)); } return meta_decryptor_128_.get(); } else if (key_len == 24) { if (meta_decryptor_196_ == NULLPTR) { meta_decryptor_196_.reset( - new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); + new parquet_encryption::AesDecryptor(algorithm_, key_len, 
true)); } return meta_decryptor_196_.get(); } else if (key_len == 32) { if (meta_decryptor_256_ == NULLPTR) { meta_decryptor_256_.reset( - new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); + new parquet_encryption::AesDecryptor(algorithm_, key_len, true)); } return meta_decryptor_256_.get(); } From ac5a96db855a29717bf5d51c7be4db291913def9 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 11:41:47 +0300 Subject: [PATCH 074/201] Rename aad in GetFooterEncryptor and GetFooterSigningEncryptor --- cpp/src/parquet/internal_file_encryptor.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/internal_file_encryptor.cc b/cpp/src/parquet/internal_file_encryptor.cc index 968b165785d9..ac4b6aee3888 100644 --- a/cpp/src/parquet/internal_file_encryptor.cc +++ b/cpp/src/parquet/internal_file_encryptor.cc @@ -63,11 +63,11 @@ std::shared_ptr InternalFileEncryptor::GetFooterEncryptor() { return footer_encryptor_; } ParquetCipher::type algorithm = properties_->algorithm().algorithm; - std::string aad = parquet_encryption::createFooterAAD(properties_->file_aad()); + std::string footer_aad = parquet_encryption::createFooterAAD(properties_->file_aad()); std::string footer_key = properties_->footer_encryption_key(); auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_key.size()); std::shared_ptr encryptor = std::make_shared( - aes_encryptor, footer_key, properties_->file_aad(), aad); + aes_encryptor, footer_key, properties_->file_aad(), footer_aad); footer_encryptor_ = encryptor; return encryptor; } @@ -77,11 +77,11 @@ std::shared_ptr InternalFileEncryptor::GetFooterSigningEncryptor() { return footer_signing_encryptor_; } ParquetCipher::type algorithm = properties_->algorithm().algorithm; - std::string aad = parquet_encryption::createFooterAAD(properties_->file_aad()); + std::string footer_aad = parquet_encryption::createFooterAAD(properties_->file_aad()); std::string footer_signing_key = 
properties_->footer_signing_key(); auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_signing_key.size()); std::shared_ptr encryptor = std::make_shared( - aes_encryptor, footer_signing_key, properties_->file_aad(), aad); + aes_encryptor, footer_signing_key, properties_->file_aad(), footer_aad); footer_signing_encryptor_ = encryptor; return encryptor; } From 1bc3329485dfd2bacae002d72f11af503d0e15eb Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 11:49:13 +0300 Subject: [PATCH 075/201] Rename verify to verify_signature --- cpp/src/parquet/file_reader.cc | 5 +++-- cpp/src/parquet/metadata.cc | 7 ++++--- cpp/src/parquet/metadata.h | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 8a8d2966e4e7..65f3ba3fa63e 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -300,8 +300,9 @@ class SerializedFile : public ParquetFileReader::Contents { } auto encryptor = file_decryptor_->GetFooterSigningEncryptor(); - if (!file_metadata_->verify(encryptor, - metadata_buffer->data() + read_metadata_len)) { + if (!file_metadata_->verify_signature(encryptor, + metadata_buffer->data() + + read_metadata_len)) { throw ParquetException( "Invalid parquet file. 
Could not verify plaintext " "footer metadata"); diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 422bc922f13c..ee4dbce87670 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -460,7 +460,8 @@ class FileMetaData::FileMetaDataImpl { InitKeyValueMetadata(); } - bool verify(std::shared_ptr encryptor, const void* tail) { + bool verify_signature(std::shared_ptr encryptor, + const void* tail) { // serialize the footer uint8_t* serialized_data; uint32_t serialized_len = metadata_len_; @@ -633,9 +634,9 @@ std::unique_ptr FileMetaData::RowGroup(int i) const { return impl_->RowGroup(i); } -bool FileMetaData::verify(std::shared_ptr encryptor, +bool FileMetaData::verify_signature(std::shared_ptr encryptor, const void* tail) { - return impl_->verify(encryptor, tail); + return impl_->verify_signature(encryptor, tail); } uint32_t FileMetaData::size() const { return impl_->size(); } diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index f7dc0e09cb41..4e10c63465a6 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -201,7 +201,7 @@ class PARQUET_EXPORT FileMetaData { ~FileMetaData(); - bool verify(std::shared_ptr encryptor, const void* tail); + bool verify_signature(std::shared_ptr encryptor, const void* tail); // file metadata uint32_t size() const; From 00e68ab5b1476f65d63a62fd8104208dd7f2c82d Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 11:53:58 +0300 Subject: [PATCH 076/201] Add comments to void WriteTo --- cpp/src/parquet/metadata.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index ee4dbce87670..2c79b8c20da1 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -508,6 +508,8 @@ class FileMetaData::FileMetaDataImpl { void WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryptor) const { ThriftSerializer serializer; + // Only in 
encrypted files with plaintext footers the + // encryption_algorithm is set in footer if (is_encryption_algorithm_set()) { uint8_t* serialized_data; uint32_t serialized_len; @@ -526,7 +528,8 @@ class FileMetaData::FileMetaDataImpl { // write tag dst->Write(encrypted_data.data() + encrypted_len - parquet_encryption::GCMTagLength, parquet_encryption::GCMTagLength); - } else { + } else { // either plaintext file (when encryptor is null) + // or encrypted file with encrypted footer serializer.Serialize(metadata_.get(), dst, encryptor, false); } } From b951882a9e5a0f1fc24befd739169af091473b40 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 11:57:19 +0300 Subject: [PATCH 077/201] Add additional comment in void WriteTo --- cpp/src/parquet/metadata.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 2c79b8c20da1..6a99d5b5e58e 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -523,9 +523,8 @@ class FileMetaData::FileMetaDataImpl { // write unencrypted footer dst->Write(serialized_data, serialized_len); - // write nonce + // Write signature (nonce and tag) dst->Write(encrypted_data.data() + 4, parquet_encryption::NonceLength); - // write tag dst->Write(encrypted_data.data() + encrypted_len - parquet_encryption::GCMTagLength, parquet_encryption::GCMTagLength); } else { // either plaintext file (when encryptor is null) From fb38044f657df478aee810778ff18cb189e9e69e Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 12:08:26 +0300 Subject: [PATCH 078/201] Rename file_encryption to file_encryption_properties in WriterProperties --- cpp/src/parquet/file_writer.cc | 4 ++-- cpp/src/parquet/metadata.cc | 18 +++++++++--------- cpp/src/parquet/properties.h | 22 +++++++++++----------- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 
bd2a77cd5bab..759214f6b5eb 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -286,7 +286,7 @@ class FileSerializer : public ParquetFileWriter::Contents { row_group_writer_.reset(); // Write magic bytes and metadata - auto file_encryption_properties = properties_->file_encryption(); + auto file_encryption_properties = properties_->file_encryption_properties(); if (file_encryption_properties == nullptr) { // plaintext regular file file_metadata_ = metadata_->Finish(); @@ -390,7 +390,7 @@ class FileSerializer : public ParquetFileWriter::Contents { std::unique_ptr file_encryptor_; void StartFile() { - auto file_encryption_properties = properties_->file_encryption(); + auto file_encryption_properties = properties_->file_encryption_properties(); if (file_encryption_properties == nullptr) { // Unencrypted parquet files always start with PAR1 PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4)); diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 6a99d5b5e58e..f06fe135ab94 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -950,8 +950,8 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } column_chunk_->__set_crypto_metadata(ccmd); - // TODO: check file_encryption() is null or not - auto footer_key = properties_->file_encryption()->footer_encryption_key(); + // TODO: check file_encryption_properties() is null or not + auto footer_key = properties_->file_encryption_properties()->footer_encryption_key(); // non-uniform: footer is unencrypted, or column is encrypted with a column-specific // key @@ -1206,8 +1206,8 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { const std::shared_ptr& key_value_metadata) : properties_(props), schema_(schema), key_value_metadata_(key_value_metadata) { metadata_.reset(new format::FileMetaData()); - if (props->file_encryption() != nullptr && - props->file_encryption()->footer_signing_key().empty()) { + if 
(props->file_encryption_properties() != nullptr && + props->file_encryption_properties()->footer_signing_key().empty()) { crypto_metadata_.reset(new format::FileCryptoMetaData()); } } @@ -1288,14 +1288,14 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { return nullptr; } - auto file_encryption = properties_->file_encryption(); + auto file_encryption_properties = properties_->file_encryption_properties(); - crypto_metadata_->__set_encryption_algorithm(ToThrift(file_encryption->algorithm())); + crypto_metadata_->__set_encryption_algorithm(ToThrift(file_encryption_properties->algorithm())); std::string key_metadata; - if (file_encryption->encrypted_footer()) - key_metadata = file_encryption->footer_encryption_key_metadata(); + if (file_encryption_properties->encrypted_footer()) + key_metadata = file_encryption_properties->footer_encryption_key_metadata(); else - key_metadata = file_encryption->footer_signing_key_metadata(); + key_metadata = file_encryption_properties->footer_signing_key_metadata(); if (!key_metadata.empty()) { crypto_metadata_->__set_key_metadata(key_metadata); diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 56aa7ed37e2a..4caa07e5a35c 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -284,8 +284,8 @@ class PARQUET_EXPORT WriterProperties { } Builder* encryption( - const std::shared_ptr& file_encryption) { - file_encryption_ = file_encryption; + const std::shared_ptr& file_encryption_properties) { + file_encryption_properties_ = file_encryption_properties; return this; } @@ -336,7 +336,7 @@ class PARQUET_EXPORT WriterProperties { return std::shared_ptr(new WriterProperties( pool_, dictionary_pagesize_limit_, write_batch_size_, max_row_group_length_, - pagesize_, version_, created_by_, std::move(file_encryption_), + pagesize_, version_, created_by_, std::move(file_encryption_properties_), default_column_properties_, column_properties)); } @@ -348,7 +348,7 @@ class PARQUET_EXPORT 
WriterProperties { int64_t pagesize_; ParquetVersion::type version_; std::string created_by_; - std::shared_ptr file_encryption_; + std::shared_ptr file_encryption_properties_; // Settings used for each column unless overridden in any of the maps below ColumnProperties default_column_properties_; @@ -372,8 +372,8 @@ class PARQUET_EXPORT WriterProperties { inline std::string created_by() const { return parquet_created_by_; } - inline FileEncryptionProperties* file_encryption() const { - return file_encryption_.get(); + inline FileEncryptionProperties* file_encryption_properties() const { + return file_encryption_properties_.get(); } inline Encoding::type dictionary_index_encoding() const { @@ -421,8 +421,8 @@ class PARQUET_EXPORT WriterProperties { std::shared_ptr column_encryption_props( const std::shared_ptr& path) const { - if (file_encryption_) { - return file_encryption_->column_properties(path); + if (file_encryption_properties_) { + return file_encryption_properties_->column_properties(path); } else { return NULLPTR; } @@ -433,7 +433,7 @@ class PARQUET_EXPORT WriterProperties { MemoryPool* pool, int64_t dictionary_pagesize_limit, int64_t write_batch_size, int64_t max_row_group_length, int64_t pagesize, ParquetVersion::type version, const std::string& created_by, - std::shared_ptr file_encryption, + std::shared_ptr file_encryption_properties, const ColumnProperties& default_column_properties, const std::unordered_map& column_properties) : pool_(pool), @@ -443,7 +443,7 @@ class PARQUET_EXPORT WriterProperties { pagesize_(pagesize), parquet_version_(version), parquet_created_by_(created_by), - file_encryption_(file_encryption), + file_encryption_properties_(file_encryption_properties), default_column_properties_(default_column_properties), column_properties_(column_properties) {} @@ -454,7 +454,7 @@ class PARQUET_EXPORT WriterProperties { int64_t pagesize_; ParquetVersion::type parquet_version_; std::string parquet_created_by_; - std::shared_ptr 
file_encryption_; + std::shared_ptr file_encryption_properties_; ColumnProperties default_column_properties_; std::unordered_map column_properties_; }; From b949797fc197ea9acf76fb024f063126d7227337 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 12:13:26 +0300 Subject: [PATCH 079/201] Use encrypted_footer instead of footer_signing_key when checking for encrypted footer --- cpp/src/parquet/metadata.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index f06fe135ab94..5da700048185 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -1207,7 +1207,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { : properties_(props), schema_(schema), key_value_metadata_(key_value_metadata) { metadata_.reset(new format::FileMetaData()); if (props->file_encryption_properties() != nullptr && - props->file_encryption_properties()->footer_signing_key().empty()) { + props->file_encryption_properties()->encrypted_footer()) { crypto_metadata_.reset(new format::FileCryptoMetaData()); } } From bd989f6cb0134ed5820e2ba20cc383d516d13154 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 12:18:24 +0300 Subject: [PATCH 080/201] Rename column_encryption_props to column_encryption_properties --- cpp/src/parquet/metadata.cc | 2 +- cpp/src/parquet/properties.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 5da700048185..0e16345099b3 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -925,7 +925,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { void WriteTo(::arrow::io::OutputStream* sink, const std::shared_ptr& encryptor) { ThriftSerializer serializer; - const auto& encrypt_md = properties_->column_encryption_props(column_->path()); + const auto& encrypt_md = properties_->column_encryption_properties(column_->path()); // 
column is unencrypted if (!encrypt_md || !encrypt_md->is_encrypted()) { diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 4caa07e5a35c..bed07a4cc57c 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -419,7 +419,7 @@ class PARQUET_EXPORT WriterProperties { return column_properties(path).max_statistics_size(); } - std::shared_ptr column_encryption_props( + std::shared_ptr column_encryption_properties( const std::shared_ptr& path) const { if (file_encryption_properties_) { return file_encryption_properties_->column_properties(path); From b6ff133190d26bbf23c163e219228717e1e3cdf8 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 12:58:43 +0300 Subject: [PATCH 081/201] Add comments in thrift.h --- cpp/src/parquet/thrift.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 1e2068c9646b..10b0729d9641 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -200,7 +200,7 @@ template inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg, const std::shared_ptr& decryptor = NULLPTR, bool shouldReadLength = false) { - if (decryptor == NULLPTR) { + if (decryptor == NULLPTR) { // thrift message is not encrypted // Deserialize msg bytes into c++ thrift msg using memory transport. 
shared_ptr tmem_transport( new ThriftBuffer(const_cast(buf), *len)); @@ -216,7 +216,7 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali } uint32_t bytes_left = tmem_transport->available_read(); *len = *len - bytes_left; - } else { + } else { // thrift message is encrypted uint32_t clen; if (shouldReadLength) { // first 4 bytes for length @@ -275,10 +275,10 @@ class ThriftSerializer { uint32_t out_length; SerializeToBuffer(obj, &out_length, &out_buffer); - if (encryptor == NULLPTR) { + if (encryptor == NULLPTR) { // obj is not encrypted PARQUET_THROW_NOT_OK(out->Write(out_buffer, out_length)); return static_cast(out_length); - } else { + } else { // obj is encrypted std::vector cipher_buffer(encryptor->CiphertextSizeDelta() + out_length); int cipher_buffer_len = encryptor->Encrypt(out_buffer, out_length, cipher_buffer.data()); From 1aa574701ab5b1215c63e765cdb290b57ff3e84b Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 13:19:20 +0300 Subject: [PATCH 082/201] Change parameters order in ColumnChunkMetaData::Make --- cpp/src/parquet/column_writer_test.cc | 8 ++++---- cpp/src/parquet/metadata.cc | 12 +++++++----- cpp/src/parquet/metadata.h | 8 +++++--- cpp/src/parquet/statistics_test.cc | 2 +- 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index c72efe47f90e..cee45c0c6f82 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -245,8 +245,8 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { // This is because the ColumnChunkMetaData semantics dictate the metadata object is // complete (no changes to the metadata buffer can be made after instantiation) ApplicationVersion app_version(this->writer_properties_->created_by()); - auto metadata_accessor = ColumnChunkMetaData::Make( - metadata_->contents(), this->descr_, -1, -1, &app_version); + auto metadata_accessor = + 
ColumnChunkMetaData::Make(metadata_->contents(), this->descr_, &app_version); return metadata_accessor->is_stats_set(); } @@ -255,8 +255,8 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { // This is because the ColumnChunkMetaData semantics dictate the metadata object is // complete (no changes to the metadata buffer can be made after instantiation) ApplicationVersion app_version(this->writer_properties_->created_by()); - auto metadata_accessor = ColumnChunkMetaData::Make( - metadata_->contents(), this->descr_, -1, -1, &app_version); + auto metadata_accessor = + ColumnChunkMetaData::Make(metadata_->contents(), this->descr_, &app_version); auto encoded_stats = metadata_accessor->statistics()->Encode(); return {encoded_stats.has_min, encoded_stats.has_max}; } diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 0e16345099b3..c21451aac692 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -292,9 +292,11 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { }; std::unique_ptr ColumnChunkMetaData::Make( - const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal, - int16_t column_ordinal, const ApplicationVersion* writer_version, - InternalFileDecryptor* file_decryptor) { + const void* metadata, const ColumnDescriptor* descr, + const ApplicationVersion* writer_version, + InternalFileDecryptor* file_decryptor, + int16_t row_group_ordinal, + int16_t column_ordinal) { return std::unique_ptr( new ColumnChunkMetaData(metadata, descr, row_group_ordinal, column_ordinal, writer_version, file_decryptor)); @@ -399,8 +401,8 @@ class RowGroupMetaData::RowGroupMetaDataImpl { throw ParquetException(ss.str()); } return ColumnChunkMetaData::Make(&row_group_->columns[i], schema_->Column(i), - row_group_ordinal, (int16_t)i, writer_version_, - file_decryptor); + writer_version_, file_decryptor, + row_group_ordinal, (int16_t)i); } private: diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h 
index 4e10c63465a6..e4ce4e5ea6f4 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -124,9 +124,11 @@ class PARQUET_EXPORT ColumnChunkMetaData { public: // API convenience to get a MetaData accessor static std::unique_ptr Make( - const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal = -1, - int16_t column_ordinal = -1, const ApplicationVersion* writer_version = NULLPTR, - InternalFileDecryptor* file_decryptor = NULLPTR); + const void* metadata, const ColumnDescriptor* descr, + const ApplicationVersion* writer_version = NULLPTR, + InternalFileDecryptor* file_decryptor = NULLPTR, + int16_t row_group_ordinal = -1, + int16_t column_ordinal = -1); ~ColumnChunkMetaData(); diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc index 0022f16e1db3..bc1953a92f55 100644 --- a/cpp/src/parquet/statistics_test.cc +++ b/cpp/src/parquet/statistics_test.cc @@ -517,7 +517,7 @@ void AssertStatsSet(const ApplicationVersion& version, const ColumnDescriptor* column, bool expected_is_set) { auto metadata_builder = ColumnChunkMetaDataBuilder::Make(props, column); auto column_chunk = - ColumnChunkMetaData::Make(metadata_builder->contents(), column, -1, -1, &version); + ColumnChunkMetaData::Make(metadata_builder->contents(), column, &version); EncodedStatistics stats; stats.set_is_signed(false); metadata_builder->SetStatistics(stats); From 329633f07b9b4333701c60a7a6efa10fa41ab75c Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 13:33:23 +0300 Subject: [PATCH 083/201] Change parameters order in PageReader::Open --- cpp/src/parquet/column_reader.cc | 8 ++++---- cpp/src/parquet/column_reader.h | 5 +++-- cpp/src/parquet/file_reader.cc | 14 ++++++++------ 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index a4fec0d69940..a6fc58429835 100644 --- a/cpp/src/parquet/column_reader.cc +++ 
b/cpp/src/parquet/column_reader.cc @@ -357,12 +357,12 @@ std::shared_ptr SerializedPageReader::NextPage() { std::unique_ptr PageReader::Open( const std::shared_ptr& stream, int64_t total_num_rows, - Compression::type codec, bool column_has_dictionary, int16_t row_group_ordinal, - int16_t column_ordinal, ::arrow::MemoryPool* pool, + Compression::type codec, ::arrow::MemoryPool* pool, + bool column_has_dictionary, int16_t row_group_ordinal, int16_t column_ordinal, std::shared_ptr meta_decryptor, std::shared_ptr data_decryptor) { return std::unique_ptr( - new SerializedPageReader(stream, total_num_rows, codec, column_has_dictionary, - row_group_ordinal, column_ordinal, pool, meta_decryptor, data_decryptor)); + new SerializedPageReader(stream, total_num_rows, codec, pool, column_has_dictionary, + row_group_ordinal, column_ordinal, meta_decryptor, data_decryptor)); } // ---------------------------------------------------------------------- diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 8e07e9a01044..f7741ede126f 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -81,9 +81,10 @@ class PARQUET_EXPORT PageReader { static std::unique_ptr Open( const std::shared_ptr& stream, int64_t total_num_rows, - Compression::type codec, bool column_has_dictionary = false, - int16_t row_group_ordinal = -1, int16_t column_ordinal = -1, + Compression::type codec, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), + bool column_has_dictionary = false, + int16_t row_group_ordinal = -1, int16_t column_ordinal = -1, std::shared_ptr meta_decryptor = NULLPTR, std::shared_ptr data_decryptor = NULLPTR); diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 65f3ba3fa63e..d869dc512ecb 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -135,8 +135,8 @@ class SerializedRowGroup : public RowGroupReader::Contents { if (!encrypted) { return 
PageReader::Open(stream, col->num_values(), col->compression(), - col->has_dictionary_page(), row_group_ordinal_, - (int16_t)i/* column_ordinal */, properties_.memory_pool()); + properties_.memory_pool(), col->has_dictionary_page(), + row_group_ordinal_, (int16_t)i/* column_ordinal */, ); } // The column is encrypted @@ -147,8 +147,9 @@ class SerializedRowGroup : public RowGroupReader::Contents { auto data_decryptor = file_decryptor_->GetFooterDecryptorForColumnData(); return PageReader::Open(stream, col->num_values(), col->compression(), - col->has_dictionary_page(), row_group_ordinal_, (int16_t)i, - properties_.memory_pool(), meta_decryptor, data_decryptor); + properties_.memory_pool(), col->has_dictionary_page(), + row_group_ordinal_, (int16_t)i, meta_decryptor, + data_decryptor); } // The column is encrypted with its own key @@ -162,8 +163,9 @@ class SerializedRowGroup : public RowGroupReader::Contents { file_decryptor_->GetColumnDataDecryptor(column_path, column_key_metadata); return PageReader::Open(stream, col->num_values(), col->compression(), - col->has_dictionary_page(), row_group_ordinal_, (int16_t)i, - properties_.memory_pool(), meta_decryptor, data_decryptor); + properties_.memory_pool(), col->has_dictionary_page(), + row_group_ordinal_, (int16_t)i, meta_decryptor, + data_decryptor); } private: From 0dfd5f294daee9e01a46e6c39eb332e95643e8c6 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 13:57:56 +0300 Subject: [PATCH 084/201] Remove footer_encryption_key and footer_signing_key --- cpp/src/parquet/encryption.h | 18 ++++-------------- cpp/src/parquet/file_writer.cc | 2 +- cpp/src/parquet/internal_file_encryptor.cc | 20 +++++++++++++------- cpp/src/parquet/metadata.cc | 10 +++------- 4 files changed, 21 insertions(+), 29 deletions(-) diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index e1f66b7fa8e6..b0ed3cc608cf 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -480,20 +480,12 @@ 
class PARQUET_EXPORT FileEncryptionProperties { const EncryptionAlgorithm algorithm() { return algorithm_; } - const std::string& footer_encryption_key() { - return (encrypted_footer_ ? footer_key_ : empty_string_); + const std::string& footer_key() { + return footer_key_; } - const std::string& footer_encryption_key_metadata() { - return (encrypted_footer_ ? footer_key_metadata_ : empty_string_); - } - - const std::string& footer_signing_key() { - return (encrypted_footer_ ? empty_string_ : footer_key_); - } - - const std::string& footer_signing_key_metadata() { - return (encrypted_footer_ ? empty_string_ : footer_key_metadata_); + const std::string& footer_key_metadata() { + return footer_key_metadata_; } const std::string& file_aad() const { return file_aad_; } @@ -508,8 +500,6 @@ class PARQUET_EXPORT FileEncryptionProperties { bool encrypted_footer_; std::string file_aad_; - const std::string empty_string_ = ""; - std::map, std::shared_ptr, schema::ColumnPath::CmpColumnPath> column_properties_; diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 759214f6b5eb..d6d7e47bfc9c 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -317,7 +317,7 @@ class FileSerializer : public ParquetFileWriter::Contents { signing_encryption.algorithm = ParquetCipher::AES_GCM_V1; file_metadata_ = metadata_->Finish( &signing_encryption, - file_encryption_properties->footer_signing_key_metadata()); + file_encryption_properties->footer_key_metadata()); auto footer_signing_encryptor = file_encryptor_->GetFooterSigningEncryptor(); WriteFileMetaData(*file_metadata_, sink_.get(), footer_signing_encryptor, false); diff --git a/cpp/src/parquet/internal_file_encryptor.cc b/cpp/src/parquet/internal_file_encryptor.cc index ac4b6aee3888..e39ecd950117 100644 --- a/cpp/src/parquet/internal_file_encryptor.cc +++ b/cpp/src/parquet/internal_file_encryptor.cc @@ -62,9 +62,14 @@ std::shared_ptr InternalFileEncryptor::GetFooterEncryptor() { if 
(footer_encryptor_ != NULLPTR) { return footer_encryptor_; } + + if (!properties_->encrypted_footer()) { + throw ParquetException("Requesting footer encryptor in file " + "with unencrypted footer"); + } ParquetCipher::type algorithm = properties_->algorithm().algorithm; std::string footer_aad = parquet_encryption::createFooterAAD(properties_->file_aad()); - std::string footer_key = properties_->footer_encryption_key(); + std::string footer_key = properties_->footer_key(); auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_key.size()); std::shared_ptr encryptor = std::make_shared( aes_encryptor, footer_key, properties_->file_aad(), footer_aad); @@ -76,9 +81,14 @@ std::shared_ptr InternalFileEncryptor::GetFooterSigningEncryptor() { if (footer_signing_encryptor_ != NULLPTR) { return footer_signing_encryptor_; } + + if (properties_->encrypted_footer()) { + throw ParquetException("Requesting signing footer encryptor in file " + "with encrypted footer"); + } ParquetCipher::type algorithm = properties_->algorithm().algorithm; std::string footer_aad = parquet_encryption::createFooterAAD(properties_->file_aad()); - std::string footer_signing_key = properties_->footer_signing_key(); + std::string footer_signing_key = properties_->footer_key(); auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_signing_key.size()); std::shared_ptr encryptor = std::make_shared( aes_encryptor, footer_signing_key, properties_->file_aad(), footer_aad); @@ -116,11 +126,7 @@ InternalFileEncryptor::InternalFileEncryptor::GetColumnEncryptor( std::string key; if (column_prop->is_encrypted_with_footer_key()) { - if (properties_->encrypted_footer()) { - key = properties_->footer_encryption_key(); - } else { - key = properties_->footer_signing_key(); - } + key = properties_->footer_key(); } else { key = column_prop->key(); } diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index c21451aac692..9499ef9ad367 100644 --- a/cpp/src/parquet/metadata.cc +++ 
b/cpp/src/parquet/metadata.cc @@ -952,8 +952,8 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } column_chunk_->__set_crypto_metadata(ccmd); - // TODO: check file_encryption_properties() is null or not - auto footer_key = properties_->file_encryption_properties()->footer_encryption_key(); + DCHECK(properties_->file_encryption_properties()); + auto footer_key = properties_->file_encryption_properties()->footer_key(); // non-uniform: footer is unencrypted, or column is encrypted with a column-specific // key @@ -1293,11 +1293,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { auto file_encryption_properties = properties_->file_encryption_properties(); crypto_metadata_->__set_encryption_algorithm(ToThrift(file_encryption_properties->algorithm())); - std::string key_metadata; - if (file_encryption_properties->encrypted_footer()) - key_metadata = file_encryption_properties->footer_encryption_key_metadata(); - else - key_metadata = file_encryption_properties->footer_signing_key_metadata(); + std::string key_metadata = file_encryption_properties->footer_key_metadata(); if (!key_metadata.empty()) { crypto_metadata_->__set_key_metadata(key_metadata); From 6bf0d6263ce4bec43a9f8ceb5bad61aee3a81905 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 14:28:03 +0300 Subject: [PATCH 085/201] Remove ParquetException in GetFooterSigningEncryptor and GetFooterEncryptor --- cpp/src/parquet/internal_file_encryptor.cc | 8 -------- 1 file changed, 8 deletions(-) diff --git a/cpp/src/parquet/internal_file_encryptor.cc b/cpp/src/parquet/internal_file_encryptor.cc index e39ecd950117..479da97ba34e 100644 --- a/cpp/src/parquet/internal_file_encryptor.cc +++ b/cpp/src/parquet/internal_file_encryptor.cc @@ -63,10 +63,6 @@ std::shared_ptr InternalFileEncryptor::GetFooterEncryptor() { return footer_encryptor_; } - if (!properties_->encrypted_footer()) { - throw ParquetException("Requesting footer encryptor in file " - "with unencrypted footer"); - } 
ParquetCipher::type algorithm = properties_->algorithm().algorithm; std::string footer_aad = parquet_encryption::createFooterAAD(properties_->file_aad()); std::string footer_key = properties_->footer_key(); @@ -82,10 +78,6 @@ std::shared_ptr InternalFileEncryptor::GetFooterSigningEncryptor() { return footer_signing_encryptor_; } - if (properties_->encrypted_footer()) { - throw ParquetException("Requesting signing footer encryptor in file " - "with encrypted footer"); - } ParquetCipher::type algorithm = properties_->algorithm().algorithm; std::string footer_aad = parquet_encryption::createFooterAAD(properties_->file_aad()); std::string footer_signing_key = properties_->footer_key(); From 16f5d784c7b3f630ef21d2fbd8e6243aa6bdbe45 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 15:20:08 +0300 Subject: [PATCH 086/201] make format --- cpp/src/parquet/column_reader.h | 8 +++----- cpp/src/parquet/encryption.h | 8 ++------ cpp/src/parquet/file_reader.cc | 5 ++--- cpp/src/parquet/file_writer.cc | 17 ++++++++--------- cpp/src/parquet/internal_file_decryptor.cc | 12 +++++------- cpp/src/parquet/metadata.cc | 17 ++++++++--------- cpp/src/parquet/metadata.h | 6 +++--- 7 files changed, 31 insertions(+), 42 deletions(-) diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index f7741ede126f..563613f0ccb3 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -81,11 +81,9 @@ class PARQUET_EXPORT PageReader { static std::unique_ptr Open( const std::shared_ptr& stream, int64_t total_num_rows, - Compression::type codec, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), - bool column_has_dictionary = false, - int16_t row_group_ordinal = -1, int16_t column_ordinal = -1, - std::shared_ptr meta_decryptor = NULLPTR, + Compression::type codec, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), + bool column_has_dictionary = false, int16_t row_group_ordinal = -1, + int16_t column_ordinal = -1, 
std::shared_ptr meta_decryptor = NULLPTR, std::shared_ptr data_decryptor = NULLPTR); // @returns: shared_ptr(nullptr) on EOS, std::shared_ptr diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index b0ed3cc608cf..5ad94e99934b 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -480,13 +480,9 @@ class PARQUET_EXPORT FileEncryptionProperties { const EncryptionAlgorithm algorithm() { return algorithm_; } - const std::string& footer_key() { - return footer_key_; - } + const std::string& footer_key() { return footer_key_; } - const std::string& footer_key_metadata() { - return footer_key_metadata_; - } + const std::string& footer_key_metadata() { return footer_key_metadata_; } const std::string& file_aad() const { return file_aad_; } diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index d869dc512ecb..038bbbe94b5a 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -302,9 +302,8 @@ class SerializedFile : public ParquetFileReader::Contents { } auto encryptor = file_decryptor_->GetFooterSigningEncryptor(); - if (!file_metadata_->verify_signature(encryptor, - metadata_buffer->data() - + read_metadata_len)) { + if (!file_metadata_->verify_signature( + encryptor, metadata_buffer->data() + read_metadata_len)) { throw ParquetException( "Invalid parquet file. 
Could not verify plaintext " "footer metadata"); diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index d6d7e47bfc9c..21a870d0b04f 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -288,11 +288,11 @@ class FileSerializer : public ParquetFileWriter::Contents { // Write magic bytes and metadata auto file_encryption_properties = properties_->file_encryption_properties(); - if (file_encryption_properties == nullptr) { // plaintext regular file + if (file_encryption_properties == nullptr) { // plaintext regular file file_metadata_ = metadata_->Finish(); WriteFileMetaData(*file_metadata_, sink_.get()); - } else { // Encrypted file - //Encrypted file with encrypted footer + } else { // Encrypted file + // Encrypted file with encrypted footer if (file_encryption_properties->encrypted_footer()) { // encrypted footer file_metadata_ = metadata_->Finish(); @@ -307,7 +307,7 @@ class FileSerializer : public ParquetFileWriter::Contents { static_cast(sink_->Tell() - metadata_start); sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4); sink_->Write(kParquetEMagic, 4); - } else { // Encrypted file with plaintext footer + } else { // Encrypted file with plaintext footer EncryptionAlgorithm signing_encryption; EncryptionAlgorithm algo = file_encryption_properties->algorithm(); signing_encryption.aad.aad_file_unique = algo.aad.aad_file_unique; @@ -316,8 +316,7 @@ class FileSerializer : public ParquetFileWriter::Contents { signing_encryption.aad.aad_prefix = algo.aad.aad_prefix; signing_encryption.algorithm = ParquetCipher::AES_GCM_V1; file_metadata_ = metadata_->Finish( - &signing_encryption, - file_encryption_properties->footer_key_metadata()); + &signing_encryption, file_encryption_properties->footer_key_metadata()); auto footer_signing_encryptor = file_encryptor_->GetFooterSigningEncryptor(); WriteFileMetaData(*file_metadata_, sink_.get(), footer_signing_encryptor, false); @@ -441,7 +440,7 @@ std::unique_ptr 
ParquetFileWriter::Open( void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sink, const std::shared_ptr& encryptor, bool encrypt_footer) { - if (encryptor == nullptr) { // plaintext regular file + if (encryptor == nullptr) { // plaintext regular file // Write MetaData int64_t position = -1; PARQUET_THROW_NOT_OK(sink->Tell(&position)); @@ -454,11 +453,11 @@ void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sin // Write Footer PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast(&metadata_len), 4)); PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4)); - } else { // Encrypted file + } else { // Encrypted file if (encrypt_footer) { // Encrypted file with encrypted footer // encrypt and write to sink file_metadata.WriteTo(sink, encryptor); - } else { // Encrypted file with plaintext footer + } else { // Encrypted file with plaintext footer uint32_t metadata_len = static_cast(sink->Tell()); file_metadata.WriteTo(sink, encryptor); metadata_len = static_cast(sink->Tell()) - metadata_len; diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index d08b807c9adf..d2f4ea139c9b 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -167,15 +167,14 @@ std::shared_ptr InternalFileDecryptor::GetFooterDecryptor( auto aes_data_decryptor = GetDataAesDecryptor(footer_key.size()); std::shared_ptr footer_metadata_decryptor = - std::make_shared(aes_metadata_decryptor, footer_key, file_aad_, aad); + std::make_shared(aes_metadata_decryptor, footer_key, file_aad_, aad); std::shared_ptr footer_data_decryptor = - std::make_shared(aes_data_decryptor, footer_key, file_aad_, aad); + std::make_shared(aes_data_decryptor, footer_key, file_aad_, aad); footer_metadata_decryptor_ = footer_metadata_decryptor; footer_data_decryptor_ = footer_data_decryptor; - if (metadata) - return footer_metadata_decryptor; + if (metadata) return 
footer_metadata_decryptor; return footer_data_decryptor; } @@ -232,13 +231,12 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( std::shared_ptr metadata_decryptor = std::make_shared(aes_metadata_decryptor, column_key, file_aad_, aad); std::shared_ptr data_decryptor = - std::make_shared(aes_data_decryptor, column_key, file_aad_, aad); + std::make_shared(aes_data_decryptor, column_key, file_aad_, aad); (*column_metadata_map_)[column_path] = metadata_decryptor; (*column_data_map_)[column_path] = data_decryptor; - if (metadata) - return metadata_decryptor; + if (metadata) return metadata_decryptor; return data_decryptor; } diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 9499ef9ad367..1fba009fe03b 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -293,10 +293,8 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { std::unique_ptr ColumnChunkMetaData::Make( const void* metadata, const ColumnDescriptor* descr, - const ApplicationVersion* writer_version, - InternalFileDecryptor* file_decryptor, - int16_t row_group_ordinal, - int16_t column_ordinal) { + const ApplicationVersion* writer_version, InternalFileDecryptor* file_decryptor, + int16_t row_group_ordinal, int16_t column_ordinal) { return std::unique_ptr( new ColumnChunkMetaData(metadata, descr, row_group_ordinal, column_ordinal, writer_version, file_decryptor)); @@ -401,8 +399,8 @@ class RowGroupMetaData::RowGroupMetaDataImpl { throw ParquetException(ss.str()); } return ColumnChunkMetaData::Make(&row_group_->columns[i], schema_->Column(i), - writer_version_, file_decryptor, - row_group_ordinal, (int16_t)i); + writer_version_, file_decryptor, row_group_ordinal, + (int16_t)i); } private: @@ -529,7 +527,7 @@ class FileMetaData::FileMetaDataImpl { dst->Write(encrypted_data.data() + 4, parquet_encryption::NonceLength); dst->Write(encrypted_data.data() + encrypted_len - parquet_encryption::GCMTagLength, parquet_encryption::GCMTagLength); - } else { // 
either plaintext file (when encryptor is null) + } else { // either plaintext file (when encryptor is null) // or encrypted file with encrypted footer serializer.Serialize(metadata_.get(), dst, encryptor, false); } @@ -639,7 +637,7 @@ std::unique_ptr FileMetaData::RowGroup(int i) const { } bool FileMetaData::verify_signature(std::shared_ptr encryptor, - const void* tail) { + const void* tail) { return impl_->verify_signature(encryptor, tail); } @@ -1292,7 +1290,8 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { auto file_encryption_properties = properties_->file_encryption_properties(); - crypto_metadata_->__set_encryption_algorithm(ToThrift(file_encryption_properties->algorithm())); + crypto_metadata_->__set_encryption_algorithm( + ToThrift(file_encryption_properties->algorithm())); std::string key_metadata = file_encryption_properties->footer_key_metadata(); if (!key_metadata.empty()) { diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index e4ce4e5ea6f4..cb12197de734 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -126,8 +126,7 @@ class PARQUET_EXPORT ColumnChunkMetaData { static std::unique_ptr Make( const void* metadata, const ColumnDescriptor* descr, const ApplicationVersion* writer_version = NULLPTR, - InternalFileDecryptor* file_decryptor = NULLPTR, - int16_t row_group_ordinal = -1, + InternalFileDecryptor* file_decryptor = NULLPTR, int16_t row_group_ordinal = -1, int16_t column_ordinal = -1); ~ColumnChunkMetaData(); @@ -203,7 +202,8 @@ class PARQUET_EXPORT FileMetaData { ~FileMetaData(); - bool verify_signature(std::shared_ptr encryptor, const void* tail); + bool verify_signature(std::shared_ptr encryptor, + const void* tail); // file metadata uint32_t size() const; From db94e056358142c1e534ea15dcd9ad48951d8a08 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 14 May 2019 15:32:03 +0300 Subject: [PATCH 087/201] make format in thrift.h --- cpp/src/parquet/thrift.h | 10 ++++++---- 1 file changed, 6 
insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 10b0729d9641..497a0e0785c8 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -200,7 +200,8 @@ template inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg, const std::shared_ptr& decryptor = NULLPTR, bool shouldReadLength = false) { - if (decryptor == NULLPTR) { // thrift message is not encrypted + // thrift message is not encrypted + if (decryptor == NULLPTR) { // Deserialize msg bytes into c++ thrift msg using memory transport. shared_ptr tmem_transport( new ThriftBuffer(const_cast(buf), *len)); @@ -216,7 +217,7 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali } uint32_t bytes_left = tmem_transport->available_read(); *len = *len - bytes_left; - } else { // thrift message is encrypted + } else { // thrift message is encrypted uint32_t clen; if (shouldReadLength) { // first 4 bytes for length @@ -275,10 +276,11 @@ class ThriftSerializer { uint32_t out_length; SerializeToBuffer(obj, &out_length, &out_buffer); - if (encryptor == NULLPTR) { // obj is not encrypted + // obj is not encrypted + if (encryptor == NULLPTR) { PARQUET_THROW_NOT_OK(out->Write(out_buffer, out_length)); return static_cast(out_length); - } else { // obj is encrypted + } else { // obj is encrypted std::vector cipher_buffer(encryptor->CiphertextSizeDelta() + out_length); int cipher_buffer_len = encryptor->Encrypt(out_buffer, out_length, cipher_buffer.data()); From 18876be7883f3741576063242dda9d6482788af9 Mon Sep 17 00:00:00 2001 From: Ha Thi Tham Date: Wed, 15 May 2019 09:49:41 +0700 Subject: [PATCH 088/201] fix rebase mistake in parquet.thrift --- cpp/src/parquet/parquet.thrift | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/parquet.thrift b/cpp/src/parquet/parquet.thrift index 000b74dde1c8..b6795e3d4874 100644 --- a/cpp/src/parquet/parquet.thrift +++ 
b/cpp/src/parquet/parquet.thrift @@ -570,7 +570,7 @@ struct PageHeader { /** Uncompressed page size in bytes (not including this header) **/ 2: required i32 uncompressed_page_size - /** Compressed page size in bytes (not including this header) **/ + /** Compressed (and potentially encrypted) page size in bytes, not including this header **/ 3: required i32 compressed_page_size /** 32bit crc for the data below. This allows for disabling checksumming in HDFS @@ -874,7 +874,7 @@ struct AesGcmV1 { /** Unique file identifier part of AAD suffix **/ 2: optional binary aad_file_unique - + /** In files encrypted with AAD prefix without storing it, * readers must supply the prefix **/ 3: optional bool supply_aad_prefix @@ -955,6 +955,7 @@ struct FileMetaData { 9: optional binary footer_signing_key_metadata } +/** Crypto metadata for files with encrypted footer **/ struct FileCryptoMetaData { /** * Encryption algorithm. This field is only used for files @@ -967,3 +968,5 @@ struct FileCryptoMetaData { * and (possibly) columns **/ 2: optional binary key_metadata } + + From 96af8cbd6dbdc1da352f2a5193787e742ac60775 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Sun, 19 May 2019 16:29:12 +0300 Subject: [PATCH 089/201] Fix aad settings in thrift.h --- cpp/src/parquet/thrift.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 497a0e0785c8..73b1ab007c5d 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -162,17 +162,29 @@ static inline format::Statistics ToThrift(const EncodedStatistics& stats) { static inline format::AesGcmV1 ToAesGcmV1Thrift(AadMetadata aad) { format::AesGcmV1 aesGcmV1; - aesGcmV1.__set_aad_prefix(aad.aad_prefix); + // aad_file_unique is always set + aesGcmV1.__isset.aad_file_unique = true; aesGcmV1.__set_aad_file_unique(aad.aad_file_unique); + aesGcmV1.__isset.supply_aad_prefix = true; aesGcmV1.__set_supply_aad_prefix(aad.supply_aad_prefix); + if 
(!aad.aad_prefix.empty()) { + aesGcmV1.__isset.aad_prefix = true; + aesGcmV1.__set_aad_prefix(aad.aad_prefix); + } return aesGcmV1; } static inline format::AesGcmCtrV1 ToAesGcmCtrV1Thrift(AadMetadata aad) { format::AesGcmCtrV1 aesGcmCtrV1; - aesGcmCtrV1.__set_aad_prefix(aad.aad_prefix); + // aad_file_unique is always set + aesGcmCtrV1.__isset.aad_file_unique = true; aesGcmCtrV1.__set_aad_file_unique(aad.aad_file_unique); + aesGcmCtrV1.__isset.supply_aad_prefix = true; aesGcmCtrV1.__set_supply_aad_prefix(aad.supply_aad_prefix); + if (!aad.aad_prefix.empty()) { + aesGcmCtrV1.__isset.aad_prefix = true; + aesGcmCtrV1.__set_aad_prefix(aad.aad_prefix); + } return aesGcmCtrV1; } From 3da31c2824ef4015c2a28b7d6565ce27a5aee953 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Mon, 20 May 2019 09:18:28 +0300 Subject: [PATCH 090/201] Port key erasure mechanism --- cpp/src/parquet/encryption.cc | 13 +++ cpp/src/parquet/encryption.h | 101 +++++++++++++++++++-- cpp/src/parquet/file_reader.cc | 11 ++- cpp/src/parquet/file_writer.cc | 3 + cpp/src/parquet/internal_file_decryptor.cc | 41 ++++++--- cpp/src/parquet/internal_file_decryptor.h | 4 + cpp/src/parquet/internal_file_encryptor.cc | 35 ++++--- cpp/src/parquet/internal_file_encryptor.h | 4 + 8 files changed, 177 insertions(+), 35 deletions(-) diff --git a/cpp/src/parquet/encryption.cc b/cpp/src/parquet/encryption.cc index c53de6746879..9666e9ae2bb4 100644 --- a/cpp/src/parquet/encryption.cc +++ b/cpp/src/parquet/encryption.cc @@ -65,6 +65,12 @@ ColumnEncryptionProperties::ColumnEncryptionProperties( bool encrypted, const std::shared_ptr& column_path, const std::string& key, const std::string& key_metadata) : column_path_(column_path) { + // column encryption properties object (with a column key) can be used for writing only + // one file. + // Upon completion of file writing, the encryption keys in the properties will be wiped + // out (set to 0 in memory). 
+ utilized_ = false; + DCHECK(column_path != nullptr); if (!encrypted) { DCHECK(key.empty() && key_metadata.empty()); @@ -87,6 +93,7 @@ ColumnEncryptionProperties::ColumnEncryptionProperties( ColumnDecryptionProperties::ColumnDecryptionProperties( const std::shared_ptr& column_path, const std::string& key) : column_path_(column_path) { + utilized_ = false; DCHECK(column_path != nullptr); if (!key.empty()) { @@ -133,6 +140,7 @@ FileDecryptionProperties::FileDecryptionProperties( aad_prefix_ = aad_prefix; column_properties_ = column_properties; plaintext_files_allowed_ = plaintext_files_allowed; + utilized_ = false; } FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key_id( @@ -176,6 +184,11 @@ FileEncryptionProperties::FileEncryptionProperties( footer_key_metadata_(footer_key_metadata), encrypted_footer_(encrypted_footer), column_properties_(column_properties) { + // file encryption properties object can be used for writing only one file. + // Upon completion of file writing, the encryption keys in the properties will be wiped + // out (set to 0 in memory). + utilized_ = false; + DCHECK(!footer_key.empty()); // footer_key must be either 16, 24 or 32 bytes. DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 5ad94e99934b..05f8da611b9d 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -22,6 +22,7 @@ #include #include #include +#include #include "arrow/util/logging.h" #include "parquet/encryption.h" @@ -101,11 +102,14 @@ class PARQUET_EXPORT ColumnEncryptionProperties { // If key is not set on an encrypted column, the column will // be encrypted with the footer key. // keyBytes Key length must be either 16, 24 or 32 bytes. 
- Builder* key(const std::string& key) { - if (key.empty()) return this; - - DCHECK(!key.empty()); - key_ = key; + // The key is cloned, and will be wiped out (array values set to 0) upon completion of + // file reading. + // Caller is responsible for wiping out the input key array. + Builder* key(std::string column_key) { + if (column_key.empty()) return this; + + DCHECK(key_.empty()); + key_ = column_key; return this; } @@ -143,6 +147,19 @@ class PARQUET_EXPORT ColumnEncryptionProperties { bool is_encrypted_with_footer_key() const { return encrypted_with_footer_key_; } const std::string& key() const { return key_; } const std::string& key_metadata() const { return key_metadata_; } + void wipeout_encryption_key() { + if (!key_.empty()) { + std::memset((char*)(const_cast(key_.c_str())), 0, key_.size()); + } + } + + bool is_utilized() { + if (key_.empty()) + return false; // can re-use column properties without encryption keys + return utilized_; + } + + void set_utilized() { utilized_ = true; } ColumnEncryptionProperties() = default; ColumnEncryptionProperties(const ColumnEncryptionProperties& other) = default; @@ -154,6 +171,7 @@ class PARQUET_EXPORT ColumnEncryptionProperties { bool encrypted_with_footer_key_; std::string key_; std::string key_metadata_; + bool utilized_; explicit ColumnEncryptionProperties( bool encrypted, const std::shared_ptr& column_path, const std::string& key, const std::string& key_metadata); @@ -198,10 +216,20 @@ class PARQUET_EXPORT ColumnDecryptionProperties { const std::shared_ptr& column_path() { return column_path_; } const std::string& key() const { return key_; } + bool is_utilized() { return utilized_; } + + void set_utilized() { utilized_ = true; } + + void wipeout_decryption_key() { + if (!key_.empty()) { + std::memset((char*)(const_cast(key_.c_str())), 0, key_.size()); + } + } private: const std::shared_ptr column_path_; std::string key_; + bool utilized_; // This class is only required for setting explicit column decryption 
keys - // to override key retriever (or to provide keys when key metadata and/or @@ -235,12 +263,15 @@ class PARQUET_EXPORT FileDecryptionProperties { // If explicit key is not set, footer key will be fetched from // key retriever. // param footerKey Key length must be either 16, 24 or 32 bytes. - Builder* footer_key(const std::string& footer_key) { - if (footer_key.empty()) { + // The key is cloned, and will be wiped out (array values set to 0) upon completion of + // file reading. + // Caller is responsible for wiping out the input key array. + Builder* footer_key(const std::string column_key) { + if (column_key.empty()) { return this; } - DCHECK(!footer_key.empty()); - footer_key_ = footer_key; + DCHECK(footer_key_.empty()); + footer_key_ = column_key; return this; } @@ -259,6 +290,15 @@ class PARQUET_EXPORT FileDecryptionProperties { if (column_properties_.size() != 0) throw ParquetException("Column properties already set"); + for (std::pair, + std::shared_ptr> + element : column_properties) { + if (element.second->is_utilized()) { + throw ParquetException("Column properties utilized in another file"); + } + element.second->set_utilized(); + } + column_properties_ = column_properties; return this; } @@ -357,6 +397,26 @@ class PARQUET_EXPORT FileDecryptionProperties { return aad_prefix_verifier_; } + void wipeout_decryption_keys() { + if (!footer_key_.empty()) + std::memset((char*)(const_cast(footer_key_.c_str())), 0, footer_key_.size()); + + for (std::pair, + std::shared_ptr> + element : column_properties_) { + element.second->wipeout_decryption_key(); + } + } + + bool is_utilized() { + if (footer_key_.empty() && column_properties_.size() == 0 && aad_prefix_.empty()) + return false; + + return utilized_; + } + + void set_utilized() { utilized_ = true; } + private: std::string footer_key_; std::string aad_prefix_; @@ -371,6 +431,7 @@ class PARQUET_EXPORT FileDecryptionProperties { std::shared_ptr key_retriever_; bool check_plaintext_footer_integrity_; bool 
plaintext_files_allowed_; + bool utilized_; FileDecryptionProperties( const std::string& footer_key, @@ -453,6 +514,14 @@ class PARQUET_EXPORT FileEncryptionProperties { if (column_properties_.size() != 0) throw ParquetException("Column properties already set"); + for (std::pair, + std::shared_ptr> + element : column_properties) { + if (element.second->is_utilized()) { + ParquetException("Column properties utilized in another file"); + } + element.second->set_utilized(); + } column_properties_ = column_properties; return this; } @@ -489,12 +558,26 @@ class PARQUET_EXPORT FileEncryptionProperties { std::shared_ptr column_properties( const std::shared_ptr& column_path); + bool is_utilized() { return utilized_; } + + void set_utilized() { utilized_ = true; } + + void wipeout_encryption_keys() { + std::memset((char*)(const_cast(footer_key_.c_str())), 0, footer_key_.size()); + for (std::pair, + std::shared_ptr> + element : column_properties_) { + element.second->wipeout_encryption_key(); + } + } + private: EncryptionAlgorithm algorithm_; std::string footer_key_; std::string footer_key_metadata_; bool encrypted_footer_; std::string file_aad_; + bool utilized_; std::map, std::shared_ptr, schema::ColumnPath::CmpColumnPath> diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 038bbbe94b5a..69c0d3965999 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -189,7 +189,16 @@ class SerializedFile : public ParquetFileReader::Contents { const ReaderProperties& props = default_reader_properties()) : source_(source), properties_(props) {} - void Close() override {} + ~SerializedFile() override { + try { + Close(); + } catch (...) 
{ + } + } + + void Close() override { + if (file_decryptor_) file_decryptor_->wipeout_decryption_keys(); + } std::shared_ptr GetRowGroup(int i) override { std::unique_ptr contents(new SerializedRowGroup( diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 21a870d0b04f..f16912151277 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -321,6 +321,9 @@ class FileSerializer : public ParquetFileWriter::Contents { WriteFileMetaData(*file_metadata_, sink_.get(), footer_signing_encryptor, false); } + if (file_encryptor_) { + file_encryptor_->wipeout_encryption_keys(); + } } sink_->Close(); diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index d2f4ea139c9b..b112896c8fcd 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -35,7 +35,7 @@ FooterSigningEncryptor::FooterSigningEncryptor(ParquetCipher::type algorithm, const std::string& aad) : algorithm_(algorithm), key_(key), file_aad_(file_aad), aad_(aad) { aes_encryptor_.reset(new parquet_encryption::AesEncryptor( - algorithm, static_cast(key_.size()), true)); + algorithm, static_cast(key_.size()), true, NULLPTR)); } int FooterSigningEncryptor::CiphertextSizeDelta() { @@ -74,6 +74,14 @@ InternalFileDecryptor::InternalFileDecryptor(FileDecryptionProperties* propertie file_aad_(file_aad), algorithm_(algorithm), footer_key_metadata_(footer_key_metadata) { + if (properties_->is_utilized()) { + throw ParquetException( + "Re-using decryption properties with explicit keys for another file"); + } + properties_->set_utilized(); + + all_decryptors_ = std::shared_ptr>( + new std::list); column_data_map_ = std::shared_ptr< std::map, std::shared_ptr, parquet::schema::ColumnPath::CmpColumnPath>>( @@ -87,6 +95,13 @@ InternalFileDecryptor::InternalFileDecryptor(FileDecryptionProperties* propertie schema::ColumnPath::CmpColumnPath>()); } +void 
InternalFileDecryptor::wipeout_decryption_keys() { + properties_->wipeout_decryption_keys(); + for (auto const& i : *all_decryptors_) { + i->WipeOut(); + } +} + std::shared_ptr InternalFileDecryptor::GetFooterSigningEncryptor() { if (footer_signing_encryptor_ != NULLPTR) return footer_signing_encryptor_; @@ -245,20 +260,20 @@ parquet_encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor( int key_len = static_cast(key_size); if (key_len == 16) { if (meta_decryptor_128_ == NULLPTR) { - meta_decryptor_128_.reset( - new parquet_encryption::AesDecryptor(algorithm_, key_len, true)); + meta_decryptor_128_.reset(new parquet_encryption::AesDecryptor( + algorithm_, key_len, true, all_decryptors_)); } return meta_decryptor_128_.get(); } else if (key_len == 24) { if (meta_decryptor_196_ == NULLPTR) { - meta_decryptor_196_.reset( - new parquet_encryption::AesDecryptor(algorithm_, key_len, true)); + meta_decryptor_196_.reset(new parquet_encryption::AesDecryptor( + algorithm_, key_len, true, all_decryptors_)); } return meta_decryptor_196_.get(); } else if (key_len == 32) { if (meta_decryptor_256_ == NULLPTR) { - meta_decryptor_256_.reset( - new parquet_encryption::AesDecryptor(algorithm_, key_len, true)); + meta_decryptor_256_.reset(new parquet_encryption::AesDecryptor( + algorithm_, key_len, true, all_decryptors_)); } return meta_decryptor_256_.get(); } @@ -270,20 +285,20 @@ parquet_encryption::AesDecryptor* InternalFileDecryptor::GetDataAesDecryptor( int key_len = static_cast(key_size); if (key_len == 16) { if (data_decryptor_128_ == NULLPTR) { - data_decryptor_128_.reset( - new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); + data_decryptor_128_.reset(new parquet_encryption::AesDecryptor( + algorithm_, key_len, false, all_decryptors_)); } return data_decryptor_128_.get(); } else if (key_len == 24) { if (data_decryptor_196_ == NULLPTR) { - data_decryptor_196_.reset( - new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); + 
data_decryptor_196_.reset(new parquet_encryption::AesDecryptor( + algorithm_, key_len, false, all_decryptors_)); } return data_decryptor_196_.get(); } else if (key_len == 32) { if (data_decryptor_256_ == NULLPTR) { - data_decryptor_256_.reset( - new parquet_encryption::AesDecryptor(algorithm_, key_len, false)); + data_decryptor_256_.reset(new parquet_encryption::AesDecryptor( + algorithm_, key_len, false, all_decryptors_)); } return data_decryptor_256_.get(); } diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h index 1e0a587b6f90..fd13a6f289ff 100644 --- a/cpp/src/parquet/internal_file_decryptor.h +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -18,6 +18,7 @@ #ifndef INTERNAL_FILE_DECRYPTOR_H #define INTERNAL_FILE_DECRYPTOR_H +#include #include #include #include @@ -85,6 +86,8 @@ class InternalFileDecryptor { FileDecryptionProperties* properties() { return properties_; } + void wipeout_decryption_keys(); + std::shared_ptr GetFooterDecryptor(); std::shared_ptr GetFooterDecryptorForColumnMeta(const std::string& aad = ""); std::shared_ptr GetFooterDecryptorForColumnData(const std::string& aad = ""); @@ -113,6 +116,7 @@ class InternalFileDecryptor { ParquetCipher::type algorithm_; std::string footer_key_metadata_; std::shared_ptr footer_signing_encryptor_; + std::shared_ptr> all_decryptors_; std::unique_ptr meta_decryptor_128_; std::unique_ptr meta_decryptor_196_; diff --git a/cpp/src/parquet/internal_file_encryptor.cc b/cpp/src/parquet/internal_file_encryptor.cc index 479da97ba34e..99f18b38fbd1 100644 --- a/cpp/src/parquet/internal_file_encryptor.cc +++ b/cpp/src/parquet/internal_file_encryptor.cc @@ -45,6 +45,9 @@ int Encryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* cip // InternalFileEncryptor InternalFileEncryptor::InternalFileEncryptor(FileEncryptionProperties* properties) : properties_(properties) { + all_encryptors_ = std::shared_ptr>( + new std::list); + column_data_map_ = 
std::shared_ptr< std::map, std::shared_ptr, parquet::schema::ColumnPath::CmpColumnPath>>( @@ -58,6 +61,14 @@ InternalFileEncryptor::InternalFileEncryptor(FileEncryptionProperties* propertie schema::ColumnPath::CmpColumnPath>()); } +void InternalFileEncryptor::wipeout_encryption_keys() { + properties_->wipeout_encryption_keys(); + + for (auto const& i : *all_encryptors_) { + i->WipeOut(); + } +} + std::shared_ptr InternalFileEncryptor::GetFooterEncryptor() { if (footer_encryptor_ != NULLPTR) { return footer_encryptor_; @@ -143,20 +154,20 @@ parquet_encryption::AesEncryptor* InternalFileEncryptor::GetMetaAesEncryptor( int key_len = static_cast(key_size); if (key_len == 16) { if (meta_encryptor_128_ == NULLPTR) { - meta_encryptor_128_.reset( - new parquet_encryption::AesEncryptor(algorithm, key_len, true)); + meta_encryptor_128_.reset(new parquet_encryption::AesEncryptor( + algorithm, key_len, true, all_encryptors_)); } return meta_encryptor_128_.get(); } else if (key_len == 24) { if (meta_encryptor_196_ == NULLPTR) { - meta_encryptor_196_.reset( - new parquet_encryption::AesEncryptor(algorithm, key_len, true)); + meta_encryptor_196_.reset(new parquet_encryption::AesEncryptor( + algorithm, key_len, true, all_encryptors_)); } return meta_encryptor_196_.get(); } else if (key_len == 32) { if (meta_encryptor_256_ == NULLPTR) { - meta_encryptor_256_.reset( - new parquet_encryption::AesEncryptor(algorithm, key_len, true)); + meta_encryptor_256_.reset(new parquet_encryption::AesEncryptor( + algorithm, key_len, true, all_encryptors_)); } return meta_encryptor_256_.get(); } @@ -168,20 +179,20 @@ parquet_encryption::AesEncryptor* InternalFileEncryptor::GetDataAesEncryptor( int key_len = static_cast(key_size); if (key_len == 16) { if (data_encryptor_128_ == NULLPTR) { - data_encryptor_128_.reset( - new parquet_encryption::AesEncryptor(algorithm, key_len, false)); + data_encryptor_128_.reset(new parquet_encryption::AesEncryptor( + algorithm, key_len, false, all_encryptors_)); } 
return data_encryptor_128_.get(); } else if (key_len == 24) { if (data_encryptor_196_ == NULLPTR) { - data_encryptor_196_.reset( - new parquet_encryption::AesEncryptor(algorithm, key_len, false)); + data_encryptor_196_.reset(new parquet_encryption::AesEncryptor( + algorithm, key_len, false, all_encryptors_)); } return data_encryptor_196_.get(); } else if (key_len == 32) { if (data_encryptor_256_ == NULLPTR) { - data_encryptor_256_.reset( - new parquet_encryption::AesEncryptor(algorithm, key_len, false)); + data_encryptor_256_.reset(new parquet_encryption::AesEncryptor( + algorithm, key_len, false, all_encryptors_)); } return data_encryptor_256_.get(); } diff --git a/cpp/src/parquet/internal_file_encryptor.h b/cpp/src/parquet/internal_file_encryptor.h index 7fcb943735cf..40d00bb5778f 100644 --- a/cpp/src/parquet/internal_file_encryptor.h +++ b/cpp/src/parquet/internal_file_encryptor.h @@ -18,6 +18,7 @@ #ifndef INTERNAL_FILE_ENCRYPTOR_H #define INTERNAL_FILE_ENCRYPTOR_H +#include #include #include #include @@ -59,6 +60,7 @@ class InternalFileEncryptor { const std::shared_ptr& column_path); std::shared_ptr GetColumnDataEncryptor( const std::shared_ptr& column_path); + void wipeout_encryption_keys(); private: FileEncryptionProperties* properties_; @@ -75,6 +77,8 @@ class InternalFileEncryptor { std::shared_ptr footer_signing_encryptor_; std::shared_ptr footer_encryptor_; + std::shared_ptr> all_encryptors_; + std::unique_ptr meta_encryptor_128_; std::unique_ptr meta_encryptor_196_; std::unique_ptr meta_encryptor_256_; From 57c484050d4bb6d4e1a10998711236f0100f52cf Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Mon, 20 May 2019 10:18:24 +0300 Subject: [PATCH 091/201] Fix columnMetaData --- cpp/src/parquet/internal_file_encryptor.h | 14 +++++ cpp/src/parquet/metadata.cc | 62 ++++++++++++++--------- 2 files changed, 52 insertions(+), 24 deletions(-) diff --git a/cpp/src/parquet/internal_file_encryptor.h b/cpp/src/parquet/internal_file_encryptor.h index 
40d00bb5778f..7b382de461ee 100644 --- a/cpp/src/parquet/internal_file_encryptor.h +++ b/cpp/src/parquet/internal_file_encryptor.h @@ -23,6 +23,7 @@ #include #include +#include "parquet/encryption.h" #include "parquet/schema.h" namespace parquet_encryption { @@ -32,6 +33,7 @@ class AesEncryptor; namespace parquet { class FileEncryptionProperties; +class ColumnEncryptionProperties; class Encryptor { public: @@ -43,6 +45,18 @@ class Encryptor { int CiphertextSizeDelta(); int Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* ciphertext); + bool encryptColumnMetaData( + bool encrypted_footer, + const std::shared_ptr& column_encryption_properties) { + // if column is not encrypted then do not encrypt the column metadata + if (!column_encryption_properties || !column_encryption_properties->is_encrypted()) + return false; + // if plaintext footer then encrypt the column metadata + if (!encrypted_footer) return true; + // if column is not encrypted with footer key then encrypt the column metadata + return !column_encryption_properties->is_encrypted_with_footer_key(); + } + private: parquet_encryption::AesEncryptor* aes_encryptor_; std::string key_; diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 1fba009fe03b..460311724ace 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -925,20 +925,21 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { void WriteTo(::arrow::io::OutputStream* sink, const std::shared_ptr& encryptor) { ThriftSerializer serializer; - const auto& encrypt_md = properties_->column_encryption_properties(column_->path()); // column is unencrypted - if (!encrypt_md || !encrypt_md->is_encrypted()) { + if (encryptor == NULLPTR) { column_chunk_->__isset.meta_data = true; column_chunk_->__set_meta_data(column_metadata_); serializer.Serialize(column_chunk_, sink); } else { // column is encrypted + const auto& encrypt_md = properties_->column_encryption_properties(column_->path()); + 
bool encrypt_metadata = encryptor->encryptColumnMetaData( + properties_->file_encryption_properties()->encrypted_footer(), encrypt_md); column_chunk_->__isset.crypto_metadata = true; - - // encrypted with footer key format::ColumnCryptoMetaData ccmd; if (encrypt_md->is_encrypted_with_footer_key()) { + // encrypted with footer key ccmd.__isset.ENCRYPTION_WITH_FOOTER_KEY = true; ccmd.__set_ENCRYPTION_WITH_FOOTER_KEY(format::EncryptionWithFooterKey()); } else { // encrypted with column key @@ -950,17 +951,15 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } column_chunk_->__set_crypto_metadata(ccmd); - DCHECK(properties_->file_encryption_properties()); - auto footer_key = properties_->file_encryption_properties()->footer_key(); - - // non-uniform: footer is unencrypted, or column is encrypted with a column-specific - // key - if ((footer_key.empty() && encrypt_md->is_encrypted()) || - !encrypt_md->is_encrypted_with_footer_key()) { + if (!encrypt_metadata) { + column_chunk_->__isset.meta_data = true; + column_chunk_->__set_meta_data(column_metadata_); + } else { // Serialize and encrypt ColumnMetadata separately // Thrift-serialize the ColumnMetaData structure, // encrypt it with the column key, and write to encrypted_column_metadata uint8_t* serialized_data; uint32_t serialized_len; + serializer.SerializeToBuffer(&column_metadata_, &serialized_len, &serialized_data); @@ -974,25 +973,40 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { std::string encrypted_column_metadata(temp, encrypted_len); column_chunk_->__set_encrypted_column_metadata(encrypted_column_metadata); // Keep redacted metadata version for old readers - if (footer_key.empty()) { - format::ColumnMetaData metadata_redacted = column_metadata_; - if (metadata_redacted.__isset.statistics) { - metadata_redacted.__isset.statistics = false; + if (!properties_->file_encryption_properties()->encrypted_footer()) { + // metadata_redacted should be stripped of the 
column_metadata_ statistics. + format::ColumnMetaData metadata_redacted; + metadata_redacted.__set_type(column_metadata_.type); + metadata_redacted.__set_encodings(column_metadata_.encodings); + metadata_redacted.__set_path_in_schema(column_metadata_.path_in_schema); + metadata_redacted.__set_codec(column_metadata_.codec); + metadata_redacted.__set_num_values(column_metadata_.num_values); + metadata_redacted.__set_total_uncompressed_size( + column_metadata_.total_uncompressed_size); + metadata_redacted.__set_total_compressed_size( + column_metadata_.total_compressed_size); + if (column_metadata_.__isset.key_value_metadata) { + metadata_redacted.__isset.key_value_metadata = true; + metadata_redacted.__set_key_value_metadata( + column_metadata_.key_value_metadata); + } + metadata_redacted.__set_data_page_offset(column_metadata_.data_page_offset); + if (column_metadata_.__isset.index_page_offset) { + metadata_redacted.__isset.index_page_offset = true; + metadata_redacted.__set_index_page_offset(column_metadata_.index_page_offset); } - if (metadata_redacted.__isset.encoding_stats) { - metadata_redacted.__isset.encoding_stats = false; + if (column_metadata_.__isset.dictionary_page_offset) { + metadata_redacted.__isset.dictionary_page_offset = true; + metadata_redacted.__set_dictionary_page_offset( + column_metadata_.dictionary_page_offset); } + metadata_redacted.__isset.statistics = false; + metadata_redacted.__isset.encoding_stats = false; + column_chunk_->__isset.meta_data = true; column_chunk_->__set_meta_data(metadata_redacted); - } else { - // don't set meta_data - column_chunk_->__isset.meta_data = true; } - } else { - column_chunk_->__isset.meta_data = true; - column_chunk_->__set_meta_data(column_metadata_); } - serializer.Serialize(column_chunk_, sink); } } From e2f7cabd045cdc508781e24fdb23f8f4ea64e5bb Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Mon, 20 May 2019 10:49:33 +0300 Subject: [PATCH 092/201] Minor fixes to previous code --- 
cpp/src/parquet/encryption.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 05f8da611b9d..90d9361b2b8f 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -103,7 +103,7 @@ class PARQUET_EXPORT ColumnEncryptionProperties { // be encrypted with the footer key. // keyBytes Key length must be either 16, 24 or 32 bytes. // The key is cloned, and will be wiped out (array values set to 0) upon completion of - // file reading. + // file writing. // Caller is responsible for wiping out the input key array. Builder* key(std::string column_key) { if (column_key.empty()) return this; @@ -262,16 +262,19 @@ class PARQUET_EXPORT FileDecryptionProperties { // will be decrypted/verified with this key. // If explicit key is not set, footer key will be fetched from // key retriever. - // param footerKey Key length must be either 16, 24 or 32 bytes. - // The key is cloned, and will be wiped out (array values set to 0) upon completion of - // file reading. + // With explicit keys or AAD prefix, new encryption properties object must be created + // for each encrypted file. + // Explicit encryption keys (footer and column) are cloned. + // Upon completion of file reading, the cloned encryption keys in the properties will + // be wiped out (array values set to 0). // Caller is responsible for wiping out the input key array. - Builder* footer_key(const std::string column_key) { - if (column_key.empty()) { + // param footerKey Key length must be either 16, 24 or 32 bytes. 
+ Builder* footer_key(const std::string footer_key) { + if (footer_key.empty()) { return this; } DCHECK(footer_key_.empty()); - footer_key_ = column_key; + footer_key_ = footer_key; return this; } From b6dfe9c341705b2a22062eb959cbc2346ffb4cb5 Mon Sep 17 00:00:00 2001 From: Ha Thi Tham Date: Fri, 17 May 2019 17:29:11 +0700 Subject: [PATCH 093/201] fix build issue on MacOS --- cpp/src/parquet/column_reader.cc | 3 ++- cpp/src/parquet/column_writer.cc | 4 ++-- cpp/src/parquet/encryption.h | 1 + cpp/src/parquet/internal_file_decryptor.cc | 2 +- cpp/src/parquet/internal_file_decryptor.h | 1 - 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index a6fc58429835..d6f0d7b033ef 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -282,7 +282,8 @@ std::shared_ptr SerializedPageReader::NextPage() { // Decrypt it if we need to if (data_decryptor_ != nullptr) { - decryption_buffer_->Resize(compressed_len - data_decryptor_->CiphertextSizeDelta()); + PARQUET_THROW_NOT_OK(decryption_buffer_->Resize( + compressed_len - data_decryptor_->CiphertextSizeDelta())); compressed_len = data_decryptor_->Decrypt(buffer, compressed_len, decryption_buffer_->mutable_data()); diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index cf357450bc28..e64c3664d412 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -290,8 +290,8 @@ class SerializedPageWriter : public PageWriter { if (data_encryptor_.get()) { parquet_encryption::quickUpdatePageAAD(data_pageAAD_, page_ordinal_); data_encryptor_->update_aad(data_pageAAD_); - encrypted_data_buffer->Resize(data_encryptor_->CiphertextSizeDelta() + - output_data_len); + PARQUET_THROW_NOT_OK(encrypted_data_buffer->Resize( + data_encryptor_->CiphertextSizeDelta() + output_data_len)); output_data_len = data_encryptor_->Encrypt(compressed_data->data(), output_data_len, 
encrypted_data_buffer->mutable_data()); output_data_buffer = encrypted_data_buffer->data(); diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 90d9361b2b8f..89541f75c266 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -246,6 +246,7 @@ class PARQUET_EXPORT AADPrefixVerifier { // In a data set, AAD Prefixes should be collected, // and then checked for missing files. virtual void check(const std::string& aad_prefix) = 0; + virtual ~AADPrefixVerifier() {} }; class PARQUET_EXPORT FileDecryptionProperties { diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index b112896c8fcd..11e44b7415da 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -33,7 +33,7 @@ FooterSigningEncryptor::FooterSigningEncryptor(ParquetCipher::type algorithm, const std::string& key, const std::string& file_aad, const std::string& aad) - : algorithm_(algorithm), key_(key), file_aad_(file_aad), aad_(aad) { + : key_(key), file_aad_(file_aad), aad_(aad) { aes_encryptor_.reset(new parquet_encryption::AesEncryptor( algorithm, static_cast(key_.size()), true, NULLPTR)); } diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h index fd13a6f289ff..4a8e0e7b0d49 100644 --- a/cpp/src/parquet/internal_file_decryptor.h +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -43,7 +43,6 @@ class FooterSigningEncryptor { uint8_t* encrypted_footer); private: - ParquetCipher::type algorithm_; std::string key_; std::string file_aad_; std::string aad_; From 049d69ce7ad53ab2fa90cd97aba6bfe3ba07f30e Mon Sep 17 00:00:00 2001 From: Ha Thi Tham Date: Thu, 30 May 2019 10:32:16 +0700 Subject: [PATCH 094/201] apply change from crypto package --- cpp/src/parquet/CMakeLists.txt | 4 +-- cpp/src/parquet/column_reader.cc | 23 +++++++-------- cpp/src/parquet/column_writer.cc | 22 +++++++------- cpp/src/parquet/file_reader.cc | 2 +- 
cpp/src/parquet/file_writer.cc | 3 +- cpp/src/parquet/internal_file_decryptor.cc | 30 +++++++++---------- cpp/src/parquet/internal_file_decryptor.h | 34 +++++++++++----------- cpp/src/parquet/internal_file_encryptor.cc | 28 +++++++++--------- cpp/src/parquet/internal_file_encryptor.h | 31 ++++++++++---------- cpp/src/parquet/metadata.cc | 16 +++++----- cpp/src/parquet/thrift.h | 4 ++- 11 files changed, 99 insertions(+), 98 deletions(-) diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 9e1638957e84..96a84ff27eb9 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -168,6 +168,7 @@ set(PARQUET_SRCS deprecated_io.cc encoding.cc encryption.cc + encryption_internal.cc internal_file_decryptor.cc internal_file_encryptor.cc file_reader.cc @@ -181,8 +182,7 @@ set(PARQUET_SRCS properties.cc schema.cc statistics.cc - types.cc - util/crypto.cc) + types.cc) # Ensure that thrift compilation is done before using its generated headers # in parquet code. 
diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index d6f0d7b033ef..ed077c0821ac 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -40,13 +40,12 @@ #include "parquet/column_page.h" #include "parquet/encoding.h" +#include "parquet/encryption_internal.h" #include "parquet/internal_file_decryptor.h" #include "parquet/properties.h" #include "parquet/statistics.h" #include "parquet/thrift.h" // IWYU pragma: keep -#include "parquet/util/crypto.h" - using arrow::MemoryPool; using arrow::internal::checked_cast; @@ -142,14 +141,14 @@ class SerializedPageReader : public PageReader { if (data_decryptor_ != NULLPTR) { DCHECK(!data_decryptor_->file_aad().empty()); // prepare the AAD for quick update later - data_pageAAD_ = parquet_encryption::createModuleAAD( - data_decryptor_->file_aad(), parquet_encryption::DataPage, row_group_ordinal_, + data_pageAAD_ = encryption::CreateModuleAad( + data_decryptor_->file_aad(), encryption::kDataPage, row_group_ordinal_, column_ordinal_, (int16_t)-1); } if (meta_decryptor_ != NULLPTR) { DCHECK(!meta_decryptor_->file_aad().empty()); - data_page_headerAAD_ = parquet_encryption::createModuleAAD( - meta_decryptor_->file_aad(), parquet_encryption::DataPageHeader, + data_page_headerAAD_ = encryption::CreateModuleAad( + meta_decryptor_->file_aad(), encryption::kDataPageHeader, row_group_ordinal_, column_ordinal_, (int16_t)-1); } } @@ -227,12 +226,12 @@ std::shared_ptr SerializedPageReader::NextPage() { if (meta_decryptor_ != NULLPTR) { if (current_page_is_dictionary) { std::string dictionary_page_header_aad; - dictionary_page_header_aad = parquet_encryption::createModuleAAD( - meta_decryptor_->file_aad(), parquet_encryption::DictionaryPageHeader, + dictionary_page_header_aad = encryption::CreateModuleAad( + meta_decryptor_->file_aad(), encryption::kDictionaryPageHeader, row_group_ordinal_, column_ordinal_, (int16_t)-1); meta_decryptor_->update_aad(dictionary_page_header_aad); } 
else { - parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, page_ordinal_); + encryption::QuickUpdatePageAad(data_page_headerAAD_, page_ordinal_); meta_decryptor_->update_aad(data_page_headerAAD_); } } @@ -260,12 +259,12 @@ std::shared_ptr SerializedPageReader::NextPage() { DCHECK(!data_decryptor_->file_aad().empty()); if (current_page_is_dictionary) { std::string dictionary_page_aad; - dictionary_page_aad = parquet_encryption::createModuleAAD( - data_decryptor_->file_aad(), parquet_encryption::DictionaryPage, + dictionary_page_aad = encryption::CreateModuleAad( + data_decryptor_->file_aad(), encryption::kDictionaryPage, row_group_ordinal_, column_ordinal_, (int16_t)-1); data_decryptor_->update_aad(dictionary_page_aad); } else { - parquet_encryption::quickUpdatePageAAD(data_pageAAD_, page_ordinal_); + encryption::QuickUpdatePageAad(data_pageAAD_, page_ordinal_); data_decryptor_->update_aad(data_pageAAD_); } } diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index e64c3664d412..f540efc1d5da 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -38,6 +38,7 @@ #include "parquet/column_page.h" #include "parquet/encoding.h" +#include "parquet/encryption_internal.h" #include "parquet/internal_file_encryptor.h" #include "parquet/metadata.h" #include "parquet/platform.h" @@ -46,7 +47,6 @@ #include "parquet/statistics.h" #include "parquet/thrift.h" #include "parquet/types.h" -#include "parquet/util/crypto.h" namespace parquet { @@ -159,13 +159,13 @@ class SerializedPageWriter : public PageWriter { data_encryptor_(data_encryptor) { if (data_encryptor_ != NULLPTR) { // prepare the add for quick update later - data_pageAAD_ = parquet_encryption::createModuleAAD( - data_encryptor_->file_aad(), parquet_encryption::DataPage, row_group_ordinal_, + data_pageAAD_ = encryption::CreateModuleAad( + data_encryptor_->file_aad(), encryption::kDataPage, row_group_ordinal_, column_ordinal_, (int16_t)-1); } if 
(meta_encryptor_ != NULLPTR) { - data_page_headerAAD_ = parquet_encryption::createModuleAAD( - meta_encryptor_->file_aad(), parquet_encryption::DataPageHeader, + data_page_headerAAD_ = encryption::CreateModuleAad( + meta_encryptor_->file_aad(), encryption::kDataPageHeader, row_group_ordinal_, column_ordinal_, (int16_t)-1); } compressor_ = GetCodecFromArrow(codec); @@ -220,8 +220,8 @@ class SerializedPageWriter : public PageWriter { } if (meta_encryptor_) { - meta_encryptor_->update_aad(parquet_encryption::createModuleAAD( - meta_encryptor_->file_aad(), parquet_encryption::DictionaryPageHeader, + meta_encryptor_->update_aad(encryption::CreateModuleAad( + meta_encryptor_->file_aad(), encryption::kDictionaryPageHeader, row_group_ordinal_, column_ordinal_, (int16_t)-1)); } int64_t header_size = @@ -242,8 +242,8 @@ class SerializedPageWriter : public PageWriter { total_compressed_size_, total_uncompressed_size_, has_dictionary, fallback); if (meta_encryptor_ != nullptr) { - meta_encryptor_->update_aad(parquet_encryption::createModuleAAD( - meta_encryptor_->file_aad(), parquet_encryption::ColumnMetaData, + meta_encryptor_->update_aad(encryption::CreateModuleAad( + meta_encryptor_->file_aad(), encryption::kColumnMetaData, row_group_ordinal_, column_ordinal_, (int16_t)-1)); } // Write metadata at end of column chunk @@ -288,7 +288,7 @@ class SerializedPageWriter : public PageWriter { std::shared_ptr encrypted_data_buffer = AllocateBuffer(pool_, 0); if (data_encryptor_.get()) { - parquet_encryption::quickUpdatePageAAD(data_pageAAD_, page_ordinal_); + encryption::QuickUpdatePageAad(data_pageAAD_, page_ordinal_); data_encryptor_->update_aad(data_pageAAD_); PARQUET_THROW_NOT_OK(encrypted_data_buffer->Resize( data_encryptor_->CiphertextSizeDelta() + output_data_len)); @@ -311,7 +311,7 @@ class SerializedPageWriter : public PageWriter { } if (meta_encryptor_) { - parquet_encryption::quickUpdatePageAAD(data_page_headerAAD_, page_ordinal_); + 
encryption::QuickUpdatePageAad(data_page_headerAAD_, page_ordinal_); meta_encryptor_->update_aad(data_page_headerAAD_); } int64_t header_size = diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 69c0d3965999..489c8fec72ec 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -32,6 +32,7 @@ #include "parquet/column_reader.h" #include "parquet/column_scanner.h" #include "parquet/deprecated_io.h" +#include "parquet/encryption_internal.h" #include "parquet/exception.h" #include "parquet/file_writer.h" #include "parquet/internal_file_decryptor.h" @@ -40,7 +41,6 @@ #include "parquet/properties.h" #include "parquet/schema.h" #include "parquet/types.h" -#include "parquet/util/crypto.h" #include "parquet/util/memory.h" namespace parquet { diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index f16912151277..67c85d23af40 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -26,10 +26,9 @@ #include "parquet/deprecated_io.h" #include "parquet/exception.h" #include "parquet/platform.h" +#include "parquet/encryption_internal.h" #include "parquet/internal_file_encryptor.h" #include "parquet/schema.h" -#include "parquet/util/crypto.h" -#include "parquet/util/memory.h" using arrow::MemoryPool; diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index 11e44b7415da..b2232fa80c62 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -17,7 +17,7 @@ #include "parquet/internal_file_decryptor.h" #include "parquet/encryption.h" -#include "parquet/util/crypto.h" +#include "parquet/encryption_internal.h" namespace parquet { @@ -34,7 +34,7 @@ FooterSigningEncryptor::FooterSigningEncryptor(ParquetCipher::type algorithm, const std::string& file_aad, const std::string& aad) : key_(key), file_aad_(file_aad), aad_(aad) { - aes_encryptor_.reset(new 
parquet_encryption::AesEncryptor( + aes_encryptor_.reset(encryption::AesEncryptor::Make( algorithm, static_cast(key_.size()), true, NULLPTR)); } @@ -51,7 +51,7 @@ int FooterSigningEncryptor::SignedFooterEncrypt(const uint8_t* footer, int foote } // Decryptor -Decryptor::Decryptor(parquet_encryption::AesDecryptor* aes_decryptor, +Decryptor::Decryptor(encryption::AesDecryptor* aes_decryptor, const std::string& key, const std::string& file_aad, const std::string& aad) : aes_decryptor_(aes_decryptor), key_(key), file_aad_(file_aad), aad_(aad) {} @@ -80,8 +80,8 @@ InternalFileDecryptor::InternalFileDecryptor(FileDecryptionProperties* propertie } properties_->set_utilized(); - all_decryptors_ = std::shared_ptr>( - new std::list); + all_decryptors_ = std::shared_ptr>( + new std::vector); column_data_map_ = std::shared_ptr< std::map, std::shared_ptr, parquet::schema::ColumnPath::CmpColumnPath>>( @@ -126,7 +126,7 @@ InternalFileDecryptor::GetFooterSigningEncryptor() { "plaintext footer metadata"); } - std::string aad = parquet_encryption::createFooterAAD(file_aad_); + std::string aad = encryption::CreateFooterAad(file_aad_); footer_signing_encryptor_ = std::make_shared(algorithm_, footer_key, file_aad_, aad); @@ -134,7 +134,7 @@ InternalFileDecryptor::GetFooterSigningEncryptor() { } std::shared_ptr InternalFileDecryptor::GetFooterDecryptor() { - std::string aad = parquet_encryption::createFooterAAD(file_aad_); + std::string aad = encryption::CreateFooterAad(file_aad_); return GetFooterDecryptor(aad, true); } @@ -255,24 +255,24 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( return data_decryptor; } -parquet_encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor( +encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor( size_t key_size) { int key_len = static_cast(key_size); if (key_len == 16) { if (meta_decryptor_128_ == NULLPTR) { - meta_decryptor_128_.reset(new parquet_encryption::AesDecryptor( + 
meta_decryptor_128_.reset(encryption::AesDecryptor::Make( algorithm_, key_len, true, all_decryptors_)); } return meta_decryptor_128_.get(); } else if (key_len == 24) { if (meta_decryptor_196_ == NULLPTR) { - meta_decryptor_196_.reset(new parquet_encryption::AesDecryptor( + meta_decryptor_196_.reset(encryption::AesDecryptor::Make( algorithm_, key_len, true, all_decryptors_)); } return meta_decryptor_196_.get(); } else if (key_len == 32) { if (meta_decryptor_256_ == NULLPTR) { - meta_decryptor_256_.reset(new parquet_encryption::AesDecryptor( + meta_decryptor_256_.reset(encryption::AesDecryptor::Make( algorithm_, key_len, true, all_decryptors_)); } return meta_decryptor_256_.get(); @@ -280,24 +280,24 @@ parquet_encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor( throw ParquetException("encryption key must be 16, 24 or 32 bytes in length"); } -parquet_encryption::AesDecryptor* InternalFileDecryptor::GetDataAesDecryptor( +encryption::AesDecryptor* InternalFileDecryptor::GetDataAesDecryptor( size_t key_size) { int key_len = static_cast(key_size); if (key_len == 16) { if (data_decryptor_128_ == NULLPTR) { - data_decryptor_128_.reset(new parquet_encryption::AesDecryptor( + data_decryptor_128_.reset(encryption::AesDecryptor::Make( algorithm_, key_len, false, all_decryptors_)); } return data_decryptor_128_.get(); } else if (key_len == 24) { if (data_decryptor_196_ == NULLPTR) { - data_decryptor_196_.reset(new parquet_encryption::AesDecryptor( + data_decryptor_196_.reset(encryption::AesDecryptor::Make( algorithm_, key_len, false, all_decryptors_)); } return data_decryptor_196_.get(); } else if (key_len == 32) { if (data_decryptor_256_ == NULLPTR) { - data_decryptor_256_.reset(new parquet_encryption::AesDecryptor( + data_decryptor_256_.reset(encryption::AesDecryptor::Make( algorithm_, key_len, false, all_decryptors_)); } return data_decryptor_256_.get(); diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h index 
4a8e0e7b0d49..3013183bf945 100644 --- a/cpp/src/parquet/internal_file_decryptor.h +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -18,19 +18,19 @@ #ifndef INTERNAL_FILE_DECRYPTOR_H #define INTERNAL_FILE_DECRYPTOR_H -#include #include #include #include +#include #include "parquet/schema.h" -namespace parquet_encryption { +namespace parquet { + +namespace encryption { class AesDecryptor; class AesEncryptor; -} // namespace parquet_encryption - -namespace parquet { +} // namespace encryption class FileDecryptionProperties; @@ -47,12 +47,12 @@ class FooterSigningEncryptor { std::string file_aad_; std::string aad_; - std::shared_ptr aes_encryptor_; + std::shared_ptr aes_encryptor_; }; class Decryptor { public: - Decryptor(parquet_encryption::AesDecryptor* decryptor, const std::string& key, + Decryptor(encryption::AesDecryptor* decryptor, const std::string& key, const std::string& file_aad, const std::string& aad); const std::string& file_aad() const { return file_aad_; } @@ -62,7 +62,7 @@ class Decryptor { int Decrypt(const uint8_t* ciphertext, int ciphertext_len, uint8_t* plaintext); private: - parquet_encryption::AesDecryptor* aes_decryptor_; + encryption::AesDecryptor* aes_decryptor_; std::string key_; std::string file_aad_; std::string aad_; @@ -115,14 +115,14 @@ class InternalFileDecryptor { ParquetCipher::type algorithm_; std::string footer_key_metadata_; std::shared_ptr footer_signing_encryptor_; - std::shared_ptr> all_decryptors_; + std::shared_ptr> all_decryptors_; - std::unique_ptr meta_decryptor_128_; - std::unique_ptr meta_decryptor_196_; - std::unique_ptr meta_decryptor_256_; - std::unique_ptr data_decryptor_128_; - std::unique_ptr data_decryptor_196_; - std::unique_ptr data_decryptor_256_; + std::unique_ptr meta_decryptor_128_; + std::unique_ptr meta_decryptor_196_; + std::unique_ptr meta_decryptor_256_; + std::unique_ptr data_decryptor_128_; + std::unique_ptr data_decryptor_196_; + std::unique_ptr data_decryptor_256_; std::shared_ptr 
GetFooterDecryptor(const std::string& aad, bool metadata); std::shared_ptr GetColumnDecryptor( @@ -130,8 +130,8 @@ class InternalFileDecryptor { const std::string& column_key_metadata, const std::string& aad, bool metadata = false); - parquet_encryption::AesDecryptor* GetMetaAesDecryptor(size_t key_size); - parquet_encryption::AesDecryptor* GetDataAesDecryptor(size_t key_size); + encryption::AesDecryptor* GetMetaAesDecryptor(size_t key_size); + encryption::AesDecryptor* GetDataAesDecryptor(size_t key_size); }; } // namespace parquet diff --git a/cpp/src/parquet/internal_file_encryptor.cc b/cpp/src/parquet/internal_file_encryptor.cc index 99f18b38fbd1..dabc804b0ad4 100644 --- a/cpp/src/parquet/internal_file_encryptor.cc +++ b/cpp/src/parquet/internal_file_encryptor.cc @@ -17,7 +17,7 @@ #include "parquet/internal_file_encryptor.h" #include "parquet/encryption.h" -#include "parquet/util/crypto.h" +#include "parquet/encryption_internal.h" namespace parquet { @@ -29,7 +29,7 @@ static inline uint8_t* str2bytes(const std::string& str) { } // Encryptor -Encryptor::Encryptor(parquet_encryption::AesEncryptor* aes_encryptor, +Encryptor::Encryptor(encryption::AesEncryptor* aes_encryptor, const std::string& key, const std::string& file_aad, const std::string& aad) : aes_encryptor_(aes_encryptor), key_(key), file_aad_(file_aad), aad_(aad) {} @@ -45,8 +45,8 @@ int Encryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* cip // InternalFileEncryptor InternalFileEncryptor::InternalFileEncryptor(FileEncryptionProperties* properties) : properties_(properties) { - all_encryptors_ = std::shared_ptr>( - new std::list); + all_encryptors_ = std::shared_ptr>( + new std::vector); column_data_map_ = std::shared_ptr< std::map, std::shared_ptr, @@ -75,7 +75,7 @@ std::shared_ptr InternalFileEncryptor::GetFooterEncryptor() { } ParquetCipher::type algorithm = properties_->algorithm().algorithm; - std::string footer_aad = 
parquet_encryption::createFooterAAD(properties_->file_aad()); + std::string footer_aad = encryption::CreateFooterAad(properties_->file_aad()); std::string footer_key = properties_->footer_key(); auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_key.size()); std::shared_ptr encryptor = std::make_shared( @@ -90,7 +90,7 @@ std::shared_ptr InternalFileEncryptor::GetFooterSigningEncryptor() { } ParquetCipher::type algorithm = properties_->algorithm().algorithm; - std::string footer_aad = parquet_encryption::createFooterAAD(properties_->file_aad()); + std::string footer_aad = encryption::CreateFooterAad(properties_->file_aad()); std::string footer_signing_key = properties_->footer_key(); auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_signing_key.size()); std::shared_ptr encryptor = std::make_shared( @@ -149,24 +149,24 @@ InternalFileEncryptor::InternalFileEncryptor::GetColumnEncryptor( return encryptor; } -parquet_encryption::AesEncryptor* InternalFileEncryptor::GetMetaAesEncryptor( +encryption::AesEncryptor* InternalFileEncryptor::GetMetaAesEncryptor( ParquetCipher::type algorithm, size_t key_size) { int key_len = static_cast(key_size); if (key_len == 16) { if (meta_encryptor_128_ == NULLPTR) { - meta_encryptor_128_.reset(new parquet_encryption::AesEncryptor( + meta_encryptor_128_.reset(encryption::AesEncryptor::Make( algorithm, key_len, true, all_encryptors_)); } return meta_encryptor_128_.get(); } else if (key_len == 24) { if (meta_encryptor_196_ == NULLPTR) { - meta_encryptor_196_.reset(new parquet_encryption::AesEncryptor( + meta_encryptor_196_.reset(encryption::AesEncryptor::Make( algorithm, key_len, true, all_encryptors_)); } return meta_encryptor_196_.get(); } else if (key_len == 32) { if (meta_encryptor_256_ == NULLPTR) { - meta_encryptor_256_.reset(new parquet_encryption::AesEncryptor( + meta_encryptor_256_.reset(encryption::AesEncryptor::Make( algorithm, key_len, true, all_encryptors_)); } return meta_encryptor_256_.get(); @@ -174,24 
+174,24 @@ parquet_encryption::AesEncryptor* InternalFileEncryptor::GetMetaAesEncryptor( throw ParquetException("encryption key must be 16, 24 or 32 bytes in length"); } -parquet_encryption::AesEncryptor* InternalFileEncryptor::GetDataAesEncryptor( +encryption::AesEncryptor* InternalFileEncryptor::GetDataAesEncryptor( ParquetCipher::type algorithm, size_t key_size) { int key_len = static_cast(key_size); if (key_len == 16) { if (data_encryptor_128_ == NULLPTR) { - data_encryptor_128_.reset(new parquet_encryption::AesEncryptor( + data_encryptor_128_.reset(encryption::AesEncryptor::Make( algorithm, key_len, false, all_encryptors_)); } return data_encryptor_128_.get(); } else if (key_len == 24) { if (data_encryptor_196_ == NULLPTR) { - data_encryptor_196_.reset(new parquet_encryption::AesEncryptor( + data_encryptor_196_.reset(encryption::AesEncryptor::Make( algorithm, key_len, false, all_encryptors_)); } return data_encryptor_196_.get(); } else if (key_len == 32) { if (data_encryptor_256_ == NULLPTR) { - data_encryptor_256_.reset(new parquet_encryption::AesEncryptor( + data_encryptor_256_.reset(encryption::AesEncryptor::Make( algorithm, key_len, false, all_encryptors_)); } return data_encryptor_256_.get(); diff --git a/cpp/src/parquet/internal_file_encryptor.h b/cpp/src/parquet/internal_file_encryptor.h index 7b382de461ee..b10367e45d0b 100644 --- a/cpp/src/parquet/internal_file_encryptor.h +++ b/cpp/src/parquet/internal_file_encryptor.h @@ -18,26 +18,27 @@ #ifndef INTERNAL_FILE_ENCRYPTOR_H #define INTERNAL_FILE_ENCRYPTOR_H -#include #include #include #include +#include #include "parquet/encryption.h" #include "parquet/schema.h" -namespace parquet_encryption { -class AesEncryptor; -} namespace parquet { +namespace encryption { +class AesEncryptor; +} // namespace encryption + class FileEncryptionProperties; class ColumnEncryptionProperties; class Encryptor { public: - Encryptor(parquet_encryption::AesEncryptor* aes_encryptor, const std::string& key, + 
Encryptor(encryption::AesEncryptor* aes_encryptor, const std::string& key, const std::string& file_aad, const std::string& aad); const std::string& file_aad() { return file_aad_; } void update_aad(const std::string& aad) { aad_ = aad; } @@ -58,7 +59,7 @@ class Encryptor { } private: - parquet_encryption::AesEncryptor* aes_encryptor_; + encryption::AesEncryptor* aes_encryptor_; std::string key_; std::string file_aad_; std::string aad_; @@ -91,21 +92,21 @@ class InternalFileEncryptor { std::shared_ptr footer_signing_encryptor_; std::shared_ptr footer_encryptor_; - std::shared_ptr> all_encryptors_; + std::shared_ptr> all_encryptors_; - std::unique_ptr meta_encryptor_128_; - std::unique_ptr meta_encryptor_196_; - std::unique_ptr meta_encryptor_256_; - std::unique_ptr data_encryptor_128_; - std::unique_ptr data_encryptor_196_; - std::unique_ptr data_encryptor_256_; + std::unique_ptr meta_encryptor_128_; + std::unique_ptr meta_encryptor_196_; + std::unique_ptr meta_encryptor_256_; + std::unique_ptr data_encryptor_128_; + std::unique_ptr data_encryptor_196_; + std::unique_ptr data_encryptor_256_; std::shared_ptr GetColumnEncryptor( const std::shared_ptr& column_path, bool metadata); - parquet_encryption::AesEncryptor* GetMetaAesEncryptor(ParquetCipher::type algorithm, + encryption::AesEncryptor* GetMetaAesEncryptor(ParquetCipher::type algorithm, size_t key_len); - parquet_encryption::AesEncryptor* GetDataAesEncryptor(ParquetCipher::type algorithm, + encryption::AesEncryptor* GetDataAesEncryptor(ParquetCipher::type algorithm, size_t key_len); }; diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 460311724ace..85c552c12941 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -198,8 +198,8 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { DCHECK(file_decryptor != NULLPTR); - std::string aad_column_metadata = parquet_encryption::createModuleAAD( - file_decryptor->file_aad(), parquet_encryption::ColumnMetaData, + 
std::string aad_column_metadata = encryption::CreateModuleAad( + file_decryptor->file_aad(), encryption::kColumnMetaData, row_group_ordinal, column_ordinal, (int16_t)-1); auto decryptor = file_decryptor->GetColumnMetaDecryptor(path, key_metadata, aad_column_metadata); @@ -471,15 +471,15 @@ class FileMetaData::FileMetaDataImpl { // encrypt with nonce uint8_t* nonce = const_cast(reinterpret_cast(tail)); uint8_t* tag = const_cast(reinterpret_cast(tail)) + - parquet_encryption::NonceLength; + encryption::kNonceLength; std::vector encrypted_buffer(encryptor->CiphertextSizeDelta() + serialized_len); uint32_t encrypted_len = encryptor->SignedFooterEncrypt( serialized_data, serialized_len, nonce, encrypted_buffer.data()); return 0 == memcmp(encrypted_buffer.data() + encrypted_len - - parquet_encryption::GCMTagLength, - tag, parquet_encryption::GCMTagLength); + encryption::kGcmTagLength, + tag, encryption::kGcmTagLength); } inline uint32_t size() const { return metadata_len_; } @@ -524,9 +524,9 @@ class FileMetaData::FileMetaDataImpl { // write unencrypted footer dst->Write(serialized_data, serialized_len); // Write signature (nonce and tag) - dst->Write(encrypted_data.data() + 4, parquet_encryption::NonceLength); - dst->Write(encrypted_data.data() + encrypted_len - parquet_encryption::GCMTagLength, - parquet_encryption::GCMTagLength); + dst->Write(encrypted_data.data() + 4, encryption::kNonceLength); + dst->Write(encrypted_data.data() + encrypted_len - encryption::kGcmTagLength, + encryption::kGcmTagLength); } else { // either plaintext file (when encryptor is null) // or encrypted file with encrypted footer serializer.Serialize(metadata_.get(), dst, encryptor, false); diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 73b1ab007c5d..9307ed193fb3 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -42,12 +42,14 @@ #include #include "arrow/util/logging.h" +#include "parquet/encryption_internal.h" #include "parquet/exception.h" #include 
"parquet/platform.h" #include "parquet/internal_file_decryptor.h" #include "parquet/internal_file_encryptor.h" #include "parquet/statistics.h" -#include "parquet/util/crypto.h" +#include "parquet/types.h" +#include "parquet/util/memory.h" #include "parquet/parquet_types.h" // IYWU pragma: export From 9910905274891b33828c7fa1b87ebf09b9d2e763 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Thu, 30 May 2019 13:22:27 +0700 Subject: [PATCH 095/201] format code --- cpp/src/parquet/column_reader.cc | 14 ++++----- cpp/src/parquet/column_writer.cc | 21 +++++++------ cpp/src/parquet/file_reader.cc | 18 +++++------ cpp/src/parquet/internal_file_decryptor.cc | 35 ++++++++++------------ cpp/src/parquet/internal_file_encryptor.cc | 29 +++++++++--------- cpp/src/parquet/internal_file_encryptor.h | 5 ++-- cpp/src/parquet/metadata.cc | 10 +++---- 7 files changed, 63 insertions(+), 69 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index ed077c0821ac..e74775e4b821 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -141,15 +141,15 @@ class SerializedPageReader : public PageReader { if (data_decryptor_ != NULLPTR) { DCHECK(!data_decryptor_->file_aad().empty()); // prepare the AAD for quick update later - data_pageAAD_ = encryption::CreateModuleAad( - data_decryptor_->file_aad(), encryption::kDataPage, row_group_ordinal_, - column_ordinal_, (int16_t)-1); + data_pageAAD_ = + encryption::CreateModuleAad(data_decryptor_->file_aad(), encryption::kDataPage, + row_group_ordinal_, column_ordinal_, (int16_t)-1); } if (meta_decryptor_ != NULLPTR) { DCHECK(!meta_decryptor_->file_aad().empty()); data_page_headerAAD_ = encryption::CreateModuleAad( - meta_decryptor_->file_aad(), encryption::kDataPageHeader, - row_group_ordinal_, column_ordinal_, (int16_t)-1); + meta_decryptor_->file_aad(), encryption::kDataPageHeader, row_group_ordinal_, + column_ordinal_, (int16_t)-1); } } @@ -260,8 +260,8 @@ std::shared_ptr 
SerializedPageReader::NextPage() { if (current_page_is_dictionary) { std::string dictionary_page_aad; dictionary_page_aad = encryption::CreateModuleAad( - data_decryptor_->file_aad(), encryption::kDictionaryPage, - row_group_ordinal_, column_ordinal_, (int16_t)-1); + data_decryptor_->file_aad(), encryption::kDictionaryPage, row_group_ordinal_, + column_ordinal_, (int16_t)-1); data_decryptor_->update_aad(dictionary_page_aad); } else { encryption::QuickUpdatePageAad(data_pageAAD_, page_ordinal_); diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index f540efc1d5da..699cd087baef 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -159,14 +159,14 @@ class SerializedPageWriter : public PageWriter { data_encryptor_(data_encryptor) { if (data_encryptor_ != NULLPTR) { // prepare the add for quick update later - data_pageAAD_ = encryption::CreateModuleAad( - data_encryptor_->file_aad(), encryption::kDataPage, row_group_ordinal_, - column_ordinal_, (int16_t)-1); + data_pageAAD_ = + encryption::CreateModuleAad(data_encryptor_->file_aad(), encryption::kDataPage, + row_group_ordinal_, column_ordinal_, (int16_t)-1); } if (meta_encryptor_ != NULLPTR) { data_page_headerAAD_ = encryption::CreateModuleAad( - meta_encryptor_->file_aad(), encryption::kDataPageHeader, - row_group_ordinal_, column_ordinal_, (int16_t)-1); + meta_encryptor_->file_aad(), encryption::kDataPageHeader, row_group_ordinal_, + column_ordinal_, (int16_t)-1); } compressor_ = GetCodecFromArrow(codec); thrift_serializer_.reset(new ThriftSerializer); @@ -195,10 +195,9 @@ class SerializedPageWriter : public PageWriter { std::shared_ptr encrypted_data_buffer = nullptr; if (data_encryptor_.get()) { data_encryptor_->update_aad( - parquet_encryption::createModuleAAD(data_encryptor_->file_aad(), - parquet_encryption::DictionaryPage, - row_group_ordinal_, - column_ordinal_, (int16_t)-1)); + encryption::CreateModuleAad( + data_encryptor_->file_aad(), 
encryption::kDictionaryPage, row_group_ordinal_, + column_ordinal_, (int16_t)-1)); encrypted_data_buffer = std::static_pointer_cast( AllocateBuffer(pool_, data_encryptor_->CiphertextSizeDelta() + output_data_len)); output_data_len = data_encryptor_->Encrypt(compressed_data->data(), output_data_len, @@ -243,8 +242,8 @@ class SerializedPageWriter : public PageWriter { fallback); if (meta_encryptor_ != nullptr) { meta_encryptor_->update_aad(encryption::CreateModuleAad( - meta_encryptor_->file_aad(), encryption::kColumnMetaData, - row_group_ordinal_, column_ordinal_, (int16_t)-1)); + meta_encryptor_->file_aad(), encryption::kColumnMetaData, row_group_ordinal_, + column_ordinal_, (int16_t)-1)); } // Write metadata at end of column chunk metadata_->WriteTo(sink_.get(), meta_encryptor_); diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 489c8fec72ec..07851437f6da 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -59,9 +59,9 @@ RowGroupReader::RowGroupReader(std::unique_ptr contents) : contents_(std::move(contents)) {} std::shared_ptr RowGroupReader::Column(int i) { - DCHECK(i < metadata()->num_columns()) << "The RowGroup only has " - << metadata()->num_columns() - << "columns, requested column: " << i; + DCHECK(i < metadata()->num_columns()) + << "The RowGroup only has " << metadata()->num_columns() + << "columns, requested column: " << i; const ColumnDescriptor* descr = metadata()->schema()->Column(i); std::unique_ptr page_reader = contents_->GetColumnPageReader(i); @@ -71,9 +71,9 @@ std::shared_ptr RowGroupReader::Column(int i) { } std::unique_ptr RowGroupReader::GetColumnPageReader(int i) { - DCHECK(i < metadata()->num_columns()) << "The RowGroup only has " - << metadata()->num_columns() - << "columns, requested column: " << i; + DCHECK(i < metadata()->num_columns()) + << "The RowGroup only has " << metadata()->num_columns() + << "columns, requested column: " << i; return 
contents_->GetColumnPageReader(i); } @@ -487,9 +487,9 @@ std::shared_ptr ParquetFileReader::metadata() const { } std::shared_ptr ParquetFileReader::RowGroup(int i) { - DCHECK(i < metadata()->num_row_groups()) << "The file only has " - << metadata()->num_row_groups() - << "row groups, requested reader for: " << i; + DCHECK(i < metadata()->num_row_groups()) + << "The file only has " << metadata()->num_row_groups() + << "row groups, requested reader for: " << i; return contents_->GetRowGroup(i); } diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index b2232fa80c62..d46101a819b6 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -51,9 +51,8 @@ int FooterSigningEncryptor::SignedFooterEncrypt(const uint8_t* footer, int foote } // Decryptor -Decryptor::Decryptor(encryption::AesDecryptor* aes_decryptor, - const std::string& key, const std::string& file_aad, - const std::string& aad) +Decryptor::Decryptor(encryption::AesDecryptor* aes_decryptor, const std::string& key, + const std::string& file_aad, const std::string& aad) : aes_decryptor_(aes_decryptor), key_(key), file_aad_(file_aad), aad_(aad) {} int Decryptor::CiphertextSizeDelta() { return aes_decryptor_->CiphertextSizeDelta(); } @@ -255,50 +254,48 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( return data_decryptor; } -encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor( - size_t key_size) { +encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor(size_t key_size) { int key_len = static_cast(key_size); if (key_len == 16) { if (meta_decryptor_128_ == NULLPTR) { - meta_decryptor_128_.reset(encryption::AesDecryptor::Make( - algorithm_, key_len, true, all_decryptors_)); + meta_decryptor_128_.reset( + encryption::AesDecryptor::Make(algorithm_, key_len, true, all_decryptors_)); } return meta_decryptor_128_.get(); } else if (key_len == 24) { if (meta_decryptor_196_ == 
NULLPTR) { - meta_decryptor_196_.reset(encryption::AesDecryptor::Make( - algorithm_, key_len, true, all_decryptors_)); + meta_decryptor_196_.reset( + encryption::AesDecryptor::Make(algorithm_, key_len, true, all_decryptors_)); } return meta_decryptor_196_.get(); } else if (key_len == 32) { if (meta_decryptor_256_ == NULLPTR) { - meta_decryptor_256_.reset(encryption::AesDecryptor::Make( - algorithm_, key_len, true, all_decryptors_)); + meta_decryptor_256_.reset( + encryption::AesDecryptor::Make(algorithm_, key_len, true, all_decryptors_)); } return meta_decryptor_256_.get(); } throw ParquetException("encryption key must be 16, 24 or 32 bytes in length"); } -encryption::AesDecryptor* InternalFileDecryptor::GetDataAesDecryptor( - size_t key_size) { +encryption::AesDecryptor* InternalFileDecryptor::GetDataAesDecryptor(size_t key_size) { int key_len = static_cast(key_size); if (key_len == 16) { if (data_decryptor_128_ == NULLPTR) { - data_decryptor_128_.reset(encryption::AesDecryptor::Make( - algorithm_, key_len, false, all_decryptors_)); + data_decryptor_128_.reset( + encryption::AesDecryptor::Make(algorithm_, key_len, false, all_decryptors_)); } return data_decryptor_128_.get(); } else if (key_len == 24) { if (data_decryptor_196_ == NULLPTR) { - data_decryptor_196_.reset(encryption::AesDecryptor::Make( - algorithm_, key_len, false, all_decryptors_)); + data_decryptor_196_.reset( + encryption::AesDecryptor::Make(algorithm_, key_len, false, all_decryptors_)); } return data_decryptor_196_.get(); } else if (key_len == 32) { if (data_decryptor_256_ == NULLPTR) { - data_decryptor_256_.reset(encryption::AesDecryptor::Make( - algorithm_, key_len, false, all_decryptors_)); + data_decryptor_256_.reset( + encryption::AesDecryptor::Make(algorithm_, key_len, false, all_decryptors_)); } return data_decryptor_256_.get(); } diff --git a/cpp/src/parquet/internal_file_encryptor.cc b/cpp/src/parquet/internal_file_encryptor.cc index dabc804b0ad4..b558f5c7fb84 100644 --- 
a/cpp/src/parquet/internal_file_encryptor.cc +++ b/cpp/src/parquet/internal_file_encryptor.cc @@ -29,9 +29,8 @@ static inline uint8_t* str2bytes(const std::string& str) { } // Encryptor -Encryptor::Encryptor(encryption::AesEncryptor* aes_encryptor, - const std::string& key, const std::string& file_aad, - const std::string& aad) +Encryptor::Encryptor(encryption::AesEncryptor* aes_encryptor, const std::string& key, + const std::string& file_aad, const std::string& aad) : aes_encryptor_(aes_encryptor), key_(key), file_aad_(file_aad), aad_(aad) {} int Encryptor::CiphertextSizeDelta() { return aes_encryptor_->CiphertextSizeDelta(); } @@ -154,20 +153,20 @@ encryption::AesEncryptor* InternalFileEncryptor::GetMetaAesEncryptor( int key_len = static_cast(key_size); if (key_len == 16) { if (meta_encryptor_128_ == NULLPTR) { - meta_encryptor_128_.reset(encryption::AesEncryptor::Make( - algorithm, key_len, true, all_encryptors_)); + meta_encryptor_128_.reset( + encryption::AesEncryptor::Make(algorithm, key_len, true, all_encryptors_)); } return meta_encryptor_128_.get(); } else if (key_len == 24) { if (meta_encryptor_196_ == NULLPTR) { - meta_encryptor_196_.reset(encryption::AesEncryptor::Make( - algorithm, key_len, true, all_encryptors_)); + meta_encryptor_196_.reset( + encryption::AesEncryptor::Make(algorithm, key_len, true, all_encryptors_)); } return meta_encryptor_196_.get(); } else if (key_len == 32) { if (meta_encryptor_256_ == NULLPTR) { - meta_encryptor_256_.reset(encryption::AesEncryptor::Make( - algorithm, key_len, true, all_encryptors_)); + meta_encryptor_256_.reset( + encryption::AesEncryptor::Make(algorithm, key_len, true, all_encryptors_)); } return meta_encryptor_256_.get(); } @@ -179,20 +178,20 @@ encryption::AesEncryptor* InternalFileEncryptor::GetDataAesEncryptor( int key_len = static_cast(key_size); if (key_len == 16) { if (data_encryptor_128_ == NULLPTR) { - data_encryptor_128_.reset(encryption::AesEncryptor::Make( - algorithm, key_len, false, 
all_encryptors_)); + data_encryptor_128_.reset( + encryption::AesEncryptor::Make(algorithm, key_len, false, all_encryptors_)); } return data_encryptor_128_.get(); } else if (key_len == 24) { if (data_encryptor_196_ == NULLPTR) { - data_encryptor_196_.reset(encryption::AesEncryptor::Make( - algorithm, key_len, false, all_encryptors_)); + data_encryptor_196_.reset( + encryption::AesEncryptor::Make(algorithm, key_len, false, all_encryptors_)); } return data_encryptor_196_.get(); } else if (key_len == 32) { if (data_encryptor_256_ == NULLPTR) { - data_encryptor_256_.reset(encryption::AesEncryptor::Make( - algorithm, key_len, false, all_encryptors_)); + data_encryptor_256_.reset( + encryption::AesEncryptor::Make(algorithm, key_len, false, all_encryptors_)); } return data_encryptor_256_.get(); } diff --git a/cpp/src/parquet/internal_file_encryptor.h b/cpp/src/parquet/internal_file_encryptor.h index b10367e45d0b..9fc0227e45f0 100644 --- a/cpp/src/parquet/internal_file_encryptor.h +++ b/cpp/src/parquet/internal_file_encryptor.h @@ -26,7 +26,6 @@ #include "parquet/encryption.h" #include "parquet/schema.h" - namespace parquet { namespace encryption { @@ -105,9 +104,9 @@ class InternalFileEncryptor { const std::shared_ptr& column_path, bool metadata); encryption::AesEncryptor* GetMetaAesEncryptor(ParquetCipher::type algorithm, - size_t key_len); + size_t key_len); encryption::AesEncryptor* GetDataAesEncryptor(ParquetCipher::type algorithm, - size_t key_len); + size_t key_len); }; } // namespace parquet diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 85c552c12941..aa05c290d1c3 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -199,8 +199,8 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { DCHECK(file_decryptor != NULLPTR); std::string aad_column_metadata = encryption::CreateModuleAad( - file_decryptor->file_aad(), encryption::kColumnMetaData, - row_group_ordinal, column_ordinal, (int16_t)-1); + 
file_decryptor->file_aad(), encryption::kColumnMetaData, row_group_ordinal, + column_ordinal, (int16_t)-1); auto decryptor = file_decryptor->GetColumnMetaDecryptor(path, key_metadata, aad_column_metadata); uint32_t len = static_cast(column->encrypted_column_metadata.size()); @@ -477,9 +477,9 @@ class FileMetaData::FileMetaDataImpl { serialized_len); uint32_t encrypted_len = encryptor->SignedFooterEncrypt( serialized_data, serialized_len, nonce, encrypted_buffer.data()); - return 0 == memcmp(encrypted_buffer.data() + encrypted_len - - encryption::kGcmTagLength, - tag, encryption::kGcmTagLength); + return 0 == + memcmp(encrypted_buffer.data() + encrypted_len - encryption::kGcmTagLength, + tag, encryption::kGcmTagLength); } inline uint32_t size() const { return metadata_len_; } From 4f0796a0c37e1780e1f2cf5dcfe8f3477522371f Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Sun, 2 Jun 2019 09:29:12 +0700 Subject: [PATCH 096/201] post-rebase change --- cpp/src/parquet/column_reader.cc | 23 ++++++++--------- cpp/src/parquet/column_writer.cc | 21 +++++++-------- cpp/src/parquet/encryption.h | 1 - cpp/src/parquet/file_reader.cc | 23 ++++++++--------- cpp/src/parquet/file_writer.cc | 44 ++++++++++++++++++++------------ cpp/src/parquet/file_writer.h | 8 ++++-- cpp/src/parquet/metadata.cc | 10 +++++--- cpp/src/parquet/metadata.h | 2 +- cpp/src/parquet/thrift.h | 9 +++---- 9 files changed, 75 insertions(+), 66 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index e74775e4b821..120f9d929e51 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -120,8 +120,7 @@ class SerializedPageReader : public PageReader { SerializedPageReader(const std::shared_ptr& stream, int64_t total_num_rows, Compression::type codec, bool column_has_dictionary, int16_t row_group_ordinal, - int16_t column_ordinal, - ::arrow::MemoryPool* pool, + int16_t column_ordinal, ::arrow::MemoryPool* pool, std::shared_ptr meta_decryptor, 
std::shared_ptr data_decryptor) : stream_(stream), @@ -236,8 +235,7 @@ std::shared_ptr SerializedPageReader::NextPage() { } } DeserializeThriftMsg(reinterpret_cast(buffer.data()), - &header_size, ¤t_page_header_, - meta_decryptor_); + &header_size, ¤t_page_header_, meta_decryptor_); break; } catch (std::exception& e) { // Failed to deserialize. Double the allowed page header size and try again @@ -283,10 +281,10 @@ std::shared_ptr SerializedPageReader::NextPage() { if (data_decryptor_ != nullptr) { PARQUET_THROW_NOT_OK(decryption_buffer_->Resize( compressed_len - data_decryptor_->CiphertextSizeDelta())); - compressed_len = data_decryptor_->Decrypt(buffer, compressed_len, + compressed_len = data_decryptor_->Decrypt(page_buffer->data(), compressed_len, decryption_buffer_->mutable_data()); - buffer = decryption_buffer_->data(); + page_buffer = decryption_buffer_; } // Uncompress it if we need to @@ -357,12 +355,13 @@ std::shared_ptr SerializedPageReader::NextPage() { std::unique_ptr PageReader::Open( const std::shared_ptr& stream, int64_t total_num_rows, - Compression::type codec, ::arrow::MemoryPool* pool, - bool column_has_dictionary, int16_t row_group_ordinal, int16_t column_ordinal, - std::shared_ptr meta_decryptor, std::shared_ptr data_decryptor) { - return std::unique_ptr( - new SerializedPageReader(stream, total_num_rows, codec, pool, column_has_dictionary, - row_group_ordinal, column_ordinal, meta_decryptor, data_decryptor)); + Compression::type codec, ::arrow::MemoryPool* pool, bool column_has_dictionary, + int16_t row_group_ordinal, int16_t column_ordinal, + std::shared_ptr meta_decryptor, + std::shared_ptr data_decryptor) { + return std::unique_ptr(new SerializedPageReader( + stream, total_num_rows, codec, column_has_dictionary, row_group_ordinal, + column_ordinal, pool, meta_decryptor, data_decryptor)); } // ---------------------------------------------------------------------- diff --git a/cpp/src/parquet/column_writer.cc 
b/cpp/src/parquet/column_writer.cc index 699cd087baef..9c82db5a046a 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -194,12 +194,11 @@ class SerializedPageWriter : public PageWriter { std::shared_ptr encrypted_data_buffer = nullptr; if (data_encryptor_.get()) { - data_encryptor_->update_aad( - encryption::CreateModuleAad( - data_encryptor_->file_aad(), encryption::kDictionaryPage, row_group_ordinal_, - column_ordinal_, (int16_t)-1)); - encrypted_data_buffer = std::static_pointer_cast( - AllocateBuffer(pool_, data_encryptor_->CiphertextSizeDelta() + output_data_len)); + data_encryptor_->update_aad(encryption::CreateModuleAad( + data_encryptor_->file_aad(), encryption::kDictionaryPage, row_group_ordinal_, + column_ordinal_, (int16_t)-1)); + encrypted_data_buffer = std::static_pointer_cast(AllocateBuffer( + pool_, data_encryptor_->CiphertextSizeDelta() + output_data_len)); output_data_len = data_encryptor_->Encrypt(compressed_data->data(), output_data_len, encrypted_data_buffer->mutable_data()); output_data_buffer = encrypted_data_buffer->data(); @@ -373,13 +372,11 @@ class BufferedPageWriter : public PageWriter { MemoryPool* pool = arrow::default_memory_pool(), std::shared_ptr meta_encryptor = NULLPTR, std::shared_ptr data_encryptor = NULLPTR) - : final_sink_(sink), - metadata_(metadata) { + : final_sink_(sink), metadata_(metadata) { in_memory_sink_ = CreateOutputStream(pool); - pager_ = std::unique_ptr( - new SerializedPageWriter(in_memory_sink_, codec, metadata, - row_group_ordinal, current_column_ordinal, pool, - meta_encryptor, data_encryptor)); + pager_ = std::unique_ptr(new SerializedPageWriter( + in_memory_sink_, codec, metadata, row_group_ordinal, current_column_ordinal, pool, + meta_encryptor, data_encryptor)); } int64_t WriteDictionaryPage(const DictionaryPage& page) override { diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 89541f75c266..3d4ec2163b0d 100644 --- 
a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -29,7 +29,6 @@ #include "parquet/exception.h" #include "parquet/schema.h" #include "parquet/types.h" -#include "parquet/util/visibility.h" namespace parquet { diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 07851437f6da..c079f2279b63 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -41,7 +41,6 @@ #include "parquet/properties.h" #include "parquet/schema.h" #include "parquet/types.h" -#include "parquet/util/memory.h" namespace parquet { @@ -124,7 +123,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::shared_ptr stream = properties_.GetStream(source_, col_start, col_length); - std::unique_ptr crypto_meta_data = col->crypto_meta_data(); + std::unique_ptr crypto_metadata = col->crypto_metadata(); bool encrypted = true; @@ -136,7 +135,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { if (!encrypted) { return PageReader::Open(stream, col->num_values(), col->compression(), properties_.memory_pool(), col->has_dictionary_page(), - row_group_ordinal_, (int16_t)i/* column_ordinal */, ); + row_group_ordinal_, (int16_t)i /* column_ordinal */); } // The column is encrypted @@ -233,7 +232,7 @@ class SerializedFile : public ParquetFileReader::Contents { // Check if all bytes are read. Check if last 4 bytes read have the magic bits if (footer_buffer->size() != footer_read_size || (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) != 0 && - memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) != 0)) { + memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) != 0)) { throw ParquetException("Invalid parquet file. 
Corrupt footer."); } @@ -323,8 +322,8 @@ class SerializedFile : public ParquetFileReader::Contents { // encryption with encrypted footer // both metadata & crypto metadata length uint32_t footer_len = arrow::util::SafeLoadAs( - reinterpret_cast(footer_buffer->data()) + footer_read_size - - kFooterSize); + reinterpret_cast(footer_buffer->data()) + footer_read_size - + kFooterSize); int64_t crypto_metadata_start = file_size - kFooterSize - footer_len; if (kFooterSize + footer_len > file_size) { throw ParquetException( @@ -385,16 +384,16 @@ class SerializedFile : public ParquetFileReader::Contents { uint32_t metadata_len = footer_len - crypto_metadata_len; std::shared_ptr metadata_buffer; PARQUET_THROW_NOT_OK( - source_->ReadAt(metadata_offset, metadata_len, &metadata_buffer)); + source_->ReadAt(metadata_offset, metadata_len, &metadata_buffer)); if (metadata_buffer->size() != metadata_len) { - throw ParquetException("Invalid encrypted parquet file. " - "Could not read footer metadata bytes."); + throw ParquetException( + "Invalid encrypted parquet file. 
" + "Could not read footer metadata bytes."); } auto footer_decryptor = file_decryptor_->GetFooterDecryptor(); - file_metadata_ = FileMetaData::Make(metadata_buffer->data(), - &metadata_len, - footer_decryptor); + file_metadata_ = + FileMetaData::Make(metadata_buffer->data(), &metadata_len, footer_decryptor); } } diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 67c85d23af40..894100262173 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -24,10 +24,10 @@ #include "parquet/column_writer.h" #include "parquet/deprecated_io.h" -#include "parquet/exception.h" -#include "parquet/platform.h" #include "parquet/encryption_internal.h" +#include "parquet/exception.h" #include "parquet/internal_file_encryptor.h" +#include "parquet/platform.h" #include "parquet/schema.h" using arrow::MemoryPool; @@ -79,9 +79,8 @@ inline void ThrowRowsMisMatchError(int col, int64_t prev, int64_t curr) { class RowGroupSerializer : public RowGroupWriter::Contents { public: RowGroupSerializer(const std::shared_ptr& sink, - RowGroupMetaDataBuilder* metadata, - int16_t row_group_ordinal, const WriterProperties* properties, - bool buffered_row_group = false, + RowGroupMetaDataBuilder* metadata, int16_t row_group_ordinal, + const WriterProperties* properties, bool buffered_row_group = false, InternalFileEncryptor* file_encryptor = NULLPTR) : sink_(sink), metadata_(metadata), @@ -296,14 +295,17 @@ class FileSerializer : public ParquetFileWriter::Contents { // encrypted footer file_metadata_ = metadata_->Finish(); - uint64_t metadata_start = static_cast(sink_->Tell()); + int64_t position = -1; + PARQUET_THROW_NOT_OK(sink_->Tell(&position)); + uint64_t metadata_start = static_cast(position); auto crypto_metadata = metadata_->GetCryptoMetaData(); WriteFileCryptoMetaData(*crypto_metadata, sink_.get()); auto footer_encryptor = file_encryptor_->GetFooterEncryptor(); WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryptor, true); + 
PARQUET_THROW_NOT_OK(sink_->Tell(&position)); uint32_t footer_and_crypto_len = - static_cast(sink_->Tell() - metadata_start); + static_cast(position - metadata_start); sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4); sink_->Write(kParquetEMagic, 4); } else { // Encrypted file with plaintext footer @@ -346,7 +348,7 @@ class FileSerializer : public ParquetFileWriter::Contents { num_row_groups_++; auto rg_metadata = metadata_->AppendRowGroup(); std::unique_ptr contents(new RowGroupSerializer( - sink_, rg_metadata, (int16_t)(num_row_groups_-1), properties_.get(), + sink_, rg_metadata, (int16_t)(num_row_groups_ - 1), properties_.get(), buffered_row_group, file_encryptor_.get())); row_group_writer_.reset(new RowGroupWriter(std::move(contents))); @@ -399,8 +401,7 @@ class FileSerializer : public ParquetFileWriter::Contents { file_encryptor_.reset(new InternalFileEncryptor(file_encryption_properties)); if (file_encryption_properties->encrypted_footer()) { PARQUET_THROW_NOT_OK(sink_->Write(kParquetEMagic, 4)); - } - else { + } else { // plaintext mode footer PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4)); } @@ -455,14 +456,17 @@ void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sin // Write Footer PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast(&metadata_len), 4)); PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4)); - } else { // Encrypted file - if (encrypt_footer) { // Encrypted file with encrypted footer + } else { // Encrypted file + if (encrypt_footer) { // Encrypted file with encrypted footer // encrypt and write to sink file_metadata.WriteTo(sink, encryptor); } else { // Encrypted file with plaintext footer - uint32_t metadata_len = static_cast(sink->Tell()); + int64_t position = -1; + PARQUET_THROW_NOT_OK(sink->Tell(&position)); + uint32_t metadata_len = static_cast(position); file_metadata.WriteTo(sink, encryptor); - metadata_len = static_cast(sink->Tell()) - metadata_len; + PARQUET_THROW_NOT_OK(sink->Tell(&position)); 
+ metadata_len = static_cast(position) - metadata_len; sink->Write(reinterpret_cast(&metadata_len), 4); sink->Write(kParquetMagic, 4); @@ -470,15 +474,21 @@ void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sin } } -void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink) { +void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink, + const std::shared_ptr& encryptor, bool encrypt_footer) { ParquetOutputWrapper wrapper(sink); - return WriteFileMetaData(file_metadata, &wrapper); + return WriteFileMetaData(file_metadata, &wrapper, encryptor, encrypt_footer); +} + +void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, + ArrowOutputStream* sink) { + crypto_metadata.WriteTo(sink); } void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, OutputStream* sink) { ParquetOutputWrapper wrapper(sink); - crypto_metadata.WriteTo(sink); + crypto_metadata.WriteTo(&wrapper); } void WriteMetaDataFile(const FileMetaData& file_metadata, ArrowOutputStream* sink) { diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h index 15b45d38377b..a1ec8183647f 100644 --- a/cpp/src/parquet/file_writer.h +++ b/cpp/src/parquet/file_writer.h @@ -105,8 +105,12 @@ void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, OutputStream* sink); PARQUET_EXPORT -void WriteFileMetaData(const FileMetaData& file_metadata, - ::arrow::io::OutputStream* sink); +void WriteFileMetaData(const FileMetaData& file_metadata, ::arrow::io::OutputStream* sink, + const std::shared_ptr& encryptor = NULLPTR, + bool encrypt_footer = false); +PARQUET_EXPORT +void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, + ::arrow::io::OutputStream* sink); PARQUET_EXPORT void WriteMetaDataFile(const FileMetaData& file_metadata, diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index aa05c290d1c3..291930f93daf 100644 --- a/cpp/src/parquet/metadata.cc +++ 
b/cpp/src/parquet/metadata.cc @@ -695,7 +695,7 @@ void FileMetaData::AppendRowGroups(const FileMetaData& other) { } void FileMetaData::WriteTo(::arrow::io::OutputStream* dst, - const std::shared_ptr& encryptor) const { + const std::shared_ptr& encryptor) const { return impl_->WriteTo(dst, encryptor); } @@ -748,7 +748,9 @@ FileCryptoMetaData::FileCryptoMetaData() : impl_(new FileCryptoMetaDataImpl()) { FileCryptoMetaData::~FileCryptoMetaData() {} -void FileCryptoMetaData::WriteTo(::arrow::io::OutputStream* dst) const { impl_->WriteTo(dst); } +void FileCryptoMetaData::WriteTo(::arrow::io::OutputStream* dst) const { + impl_->WriteTo(dst); +} ApplicationVersion::ApplicationVersion(const std::string& application, int major, int minor, int patch) @@ -923,7 +925,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } void WriteTo(::arrow::io::OutputStream* sink, - const std::shared_ptr& encryptor) { + const std::shared_ptr& encryptor) { ThriftSerializer serializer; // column is unencrypted @@ -1075,7 +1077,7 @@ void ColumnChunkMetaDataBuilder::Finish(int64_t num_values, } void ColumnChunkMetaDataBuilder::WriteTo(::arrow::io::OutputStream* sink, - const std::shared_ptr& encryptor) { + const std::shared_ptr& encryptor) { impl_->WriteTo(sink, encryptor); } diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index cb12197de734..83f2d4a7ce63 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -302,7 +302,7 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { // For writing metadata at end of column chunk void WriteTo(::arrow::io::OutputStream* sink, - const std::shared_ptr& encryptor = NULLPTR); + const std::shared_ptr& encryptor = NULLPTR); private: explicit ColumnChunkMetaDataBuilder(const std::shared_ptr& props, diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 9307ed193fb3..709f17970b0a 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -44,12 +44,11 @@ #include 
"arrow/util/logging.h" #include "parquet/encryption_internal.h" #include "parquet/exception.h" -#include "parquet/platform.h" #include "parquet/internal_file_decryptor.h" #include "parquet/internal_file_encryptor.h" +#include "parquet/platform.h" #include "parquet/statistics.h" #include "parquet/types.h" -#include "parquet/util/memory.h" #include "parquet/parquet_types.h" // IYWU pragma: export @@ -307,11 +306,11 @@ class ThriftSerializer { } if (shouldWriteLength) { - PARQUET_THROW_NOT_OK(out->Write(reinterpret_cast(&cipher_buffer_len), 4)); + PARQUET_THROW_NOT_OK( + out->Write(reinterpret_cast(&cipher_buffer_len), 4)); PARQUET_THROW_NOT_OK(out->Write(cipher_buffer.data(), cipher_buffer_len)); return static_cast(cipher_buffer_len + 4); - } - else { + } else { PARQUET_THROW_NOT_OK(out->Write(cipher_buffer.data(), cipher_buffer_len)); return static_cast(cipher_buffer_len); } From 8eb339b851bab9500c86bb112916989e131bdcb7 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Sun, 2 Jun 2019 14:12:15 +0700 Subject: [PATCH 097/201] add unit tests for encryption properties --- cpp/src/parquet/CMakeLists.txt | 4 + cpp/src/parquet/encryption-test.cc | 292 +++++++++++++++++++++++++++++ 2 files changed, 296 insertions(+) create mode 100644 cpp/src/parquet/encryption-test.cc diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 96a84ff27eb9..77de75553f0c 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -321,6 +321,10 @@ add_parquet_test(arrow-test arrow/arrow_schema_test.cc test_util.cc) +if(PARQUET_BUILD_ENCRYPTION) + add_parquet_test(encryption-test) +endif() + # Those tests need to use static linking as they access thrift-generated # symbols which are not exported by parquet.dll on Windows (PARQUET-1420). 
add_parquet_test(file_deserialize_test diff --git a/cpp/src/parquet/encryption-test.cc b/cpp/src/parquet/encryption-test.cc new file mode 100644 index 000000000000..e4d4a27ed7f4 --- /dev/null +++ b/cpp/src/parquet/encryption-test.cc @@ -0,0 +1,292 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include + +#include "parquet/encryption.h" + +namespace parquet { + +using schema::ColumnPath; + +namespace test { + +const std::string kFooterEncryptionKey = "0123456789012345"; // 128bit/16 +const std::string kColumnEncryptionKey1 = "1234567890123450"; +const std::string kColumnEncryptionKey2 = "1234567890123451"; +const std::string kFileName = "tester"; + +TEST(TestColumnEncryptionProperties, ColumnEncryptedWithOwnKey) { + std::shared_ptr column_path_1 = + parquet::schema::ColumnPath::FromDotString("column_1"); + ColumnEncryptionProperties::Builder column_builder_1(column_path_1); + column_builder_1.key(kColumnEncryptionKey1); + column_builder_1.key_id("kc1"); + std::shared_ptr column_props_1 = column_builder_1.build(); + + ASSERT_EQ(column_path_1->ToDotString(), column_props_1->column_path()->ToDotString()); + ASSERT_EQ(true, column_props_1->is_encrypted()); + ASSERT_EQ(false, column_props_1->is_encrypted_with_footer_key()); + ASSERT_EQ(kColumnEncryptionKey1, column_props_1->key()); + ASSERT_EQ("kc1", column_props_1->key_metadata()); +} + +TEST(TestColumnEncryptionProperties, ColumnEncryptedWithFooterKey) { + std::shared_ptr column_path_1 = + parquet::schema::ColumnPath::FromDotString("column_1"); + ColumnEncryptionProperties::Builder column_builder_1(column_path_1); + std::shared_ptr column_props_1 = column_builder_1.build(); + + ASSERT_EQ(column_path_1->ToDotString(), column_props_1->column_path()->ToDotString()); + ASSERT_EQ(true, column_props_1->is_encrypted()); + ASSERT_EQ(true, column_props_1->is_encrypted_with_footer_key()); +} + +// Encrypt all columns and the footer with the same key. 
+// (uniform encryption) +TEST(TestEncryptionProperties, UniformEncryption) { + FileEncryptionProperties::Builder builder(kFooterEncryptionKey); + builder.footer_key_metadata("kf"); + std::shared_ptr props = builder.build(); + + ASSERT_EQ(true, props->encrypted_footer()); + ASSERT_EQ(DEFAULT_ENCRYPTION_ALGORITHM, props->algorithm().algorithm); + ASSERT_EQ(kFooterEncryptionKey, props->footer_key()); + ASSERT_EQ("kf", props->footer_key_metadata()); + + std::shared_ptr column_path = + parquet::schema::ColumnPath::FromDotString("a_column"); + std::shared_ptr out_col_props = + props->column_properties(column_path); + + ASSERT_EQ(true, out_col_props->is_encrypted()); + ASSERT_EQ(true, out_col_props->is_encrypted_with_footer_key()); +} + +// Encrypt two columns with their own keys and the same key for +// the footer and other columns +TEST(TestEncryptionProperties, EncryptFooterAndTwoColumns) { + std::shared_ptr column_path_1 = + parquet::schema::ColumnPath::FromDotString("column_1"); + ColumnEncryptionProperties::Builder column_builder_1(column_path_1); + column_builder_1.key(kColumnEncryptionKey1); + column_builder_1.key_id("kc1"); + + std::shared_ptr column_path_2 = + parquet::schema::ColumnPath::FromDotString("column_2"); + ColumnEncryptionProperties::Builder column_builder_2(column_path_2); + column_builder_2.key(kColumnEncryptionKey2); + column_builder_2.key_id("kc2"); + + std::map, + std::shared_ptr, schema::ColumnPath::CmpColumnPath> + column_properties; + column_properties[column_path_1] = column_builder_1.build(); + column_properties[column_path_2] = column_builder_2.build(); + + FileEncryptionProperties::Builder builder(kFooterEncryptionKey); + builder.footer_key_metadata("kf"); + builder.column_properties(column_properties); + std::shared_ptr props = builder.build(); + + ASSERT_EQ(true, props->encrypted_footer()); + ASSERT_EQ(DEFAULT_ENCRYPTION_ALGORITHM, props->algorithm().algorithm); + ASSERT_EQ(kFooterEncryptionKey, props->footer_key()); + + std::shared_ptr 
out_col_props_1 = + props->column_properties(column_path_1); + + ASSERT_EQ(column_path_1->ToDotString(), out_col_props_1->column_path()->ToDotString()); + ASSERT_EQ(true, out_col_props_1->is_encrypted()); + ASSERT_EQ(false, out_col_props_1->is_encrypted_with_footer_key()); + ASSERT_EQ(kColumnEncryptionKey1, out_col_props_1->key()); + ASSERT_EQ("kc1", out_col_props_1->key_metadata()); + + std::shared_ptr out_col_props_2 = + props->column_properties(column_path_2); + + ASSERT_EQ(column_path_2->ToDotString(), out_col_props_2->column_path()->ToDotString()); + ASSERT_EQ(true, out_col_props_2->is_encrypted()); + ASSERT_EQ(false, out_col_props_2->is_encrypted_with_footer_key()); + ASSERT_EQ(kColumnEncryptionKey2, out_col_props_2->key()); + ASSERT_EQ("kc2", out_col_props_2->key_metadata()); + + std::shared_ptr column_path_3 = + parquet::schema::ColumnPath::FromDotString("column_3"); + std::shared_ptr out_col_props_3 = + props->column_properties(column_path_3); + + ASSERT_EQ(NULLPTR, out_col_props_3); +} + +// Encryption configuration 3: Encrypt two columns, don’t encrypt footer. 
+// (plaintext footer mode, readable by legacy readers) +TEST(TestEncryptionProperties, EncryptTwoColumnsNotFooter) { + std::shared_ptr column_path_1 = + parquet::schema::ColumnPath::FromDotString("column_1"); + ColumnEncryptionProperties::Builder column_builder_1(column_path_1); + column_builder_1.key(kColumnEncryptionKey1); + column_builder_1.key_id("kc1"); + + std::shared_ptr column_path_2 = + parquet::schema::ColumnPath::FromDotString("column_2"); + ColumnEncryptionProperties::Builder column_builder_2(column_path_2); + column_builder_2.key(kColumnEncryptionKey2); + column_builder_2.key_id("kc2"); + + std::map, + std::shared_ptr, schema::ColumnPath::CmpColumnPath> + column_properties; + column_properties[column_path_1] = column_builder_1.build(); + column_properties[column_path_2] = column_builder_2.build(); + + FileEncryptionProperties::Builder builder(kFooterEncryptionKey); + builder.footer_key_metadata("kf"); + builder.set_plaintext_footer(); + builder.column_properties(column_properties); + std::shared_ptr props = builder.build(); + + ASSERT_EQ(false, props->encrypted_footer()); + ASSERT_EQ(DEFAULT_ENCRYPTION_ALGORITHM, props->algorithm().algorithm); + ASSERT_EQ(kFooterEncryptionKey, props->footer_key()); + + std::shared_ptr out_col_props_1 = + props->column_properties(column_path_1); + + ASSERT_EQ(column_path_1->ToDotString(), out_col_props_1->column_path()->ToDotString()); + ASSERT_EQ(true, out_col_props_1->is_encrypted()); + ASSERT_EQ(false, out_col_props_1->is_encrypted_with_footer_key()); + ASSERT_EQ(kColumnEncryptionKey1, out_col_props_1->key()); + ASSERT_EQ("kc1", out_col_props_1->key_metadata()); + + std::shared_ptr out_col_props_2 = + props->column_properties(column_path_2); + + ASSERT_EQ(column_path_2->ToDotString(), out_col_props_2->column_path()->ToDotString()); + ASSERT_EQ(true, out_col_props_2->is_encrypted()); + ASSERT_EQ(false, out_col_props_2->is_encrypted_with_footer_key()); + ASSERT_EQ(kColumnEncryptionKey2, out_col_props_2->key()); + 
ASSERT_EQ("kc2", out_col_props_2->key_metadata()); + + // other columns: encrypted with footer, footer is not encrypted + // so column is not encrypted as well + std::shared_ptr column_path_3 = + parquet::schema::ColumnPath::FromDotString("column_3"); + std::shared_ptr out_col_props_3 = + props->column_properties(column_path_3); + + ASSERT_EQ(NULLPTR, out_col_props_3); +} + +// Use aad_prefix +TEST(TestEncryptionProperties, UseAadPrefix) { + FileEncryptionProperties::Builder builder(kFooterEncryptionKey); + builder.aad_prefix(kFileName); + std::shared_ptr props = builder.build(); + + ASSERT_EQ(kFileName, props->algorithm().aad.aad_prefix); + ASSERT_EQ(false, props->algorithm().aad.supply_aad_prefix); +} + +// Use aad_prefix and +// disable_aad_prefix_storage. +TEST(TestEncryptionProperties, UseAadPrefixNotStoreInFile) { + FileEncryptionProperties::Builder builder(kFooterEncryptionKey); + builder.aad_prefix(kFileName); + builder.disable_store_aad_prefix_storage(); + std::shared_ptr props = builder.build(); + + ASSERT_EQ("", props->algorithm().aad.aad_prefix); + ASSERT_EQ(true, props->algorithm().aad.supply_aad_prefix); +} + +// Use AES_GCM_CTR_V1 algorithm +TEST(TestEncryptionProperties, UseAES_GCM_CTR_V1Algorithm) { + FileEncryptionProperties::Builder builder(kFooterEncryptionKey); + builder.algorithm(ParquetCipher::AES_GCM_CTR_V1); + std::shared_ptr props = builder.build(); + + ASSERT_EQ(ParquetCipher::AES_GCM_CTR_V1, props->algorithm().algorithm); +} + +TEST(TestDecryptionProperties, UseKeyRetriever) { + std::shared_ptr string_kr1 = + std::make_shared(); + string_kr1->PutKey("kf", kFooterEncryptionKey); + string_kr1->PutKey("kc1", kColumnEncryptionKey1); + string_kr1->PutKey("kc2", kColumnEncryptionKey2); + std::shared_ptr kr1 = + std::static_pointer_cast(string_kr1); + + parquet::FileDecryptionProperties::Builder builder; + builder.key_retriever(kr1); + std::shared_ptr props = builder.build(); + + auto out_key_retriever = props->key_retriever(); + 
ASSERT_EQ(kFooterEncryptionKey, out_key_retriever->GetKey("kf")); + ASSERT_EQ(kColumnEncryptionKey1, out_key_retriever->GetKey("kc1")); + ASSERT_EQ(kColumnEncryptionKey2, out_key_retriever->GetKey("kc2")); +} + +TEST(TestDecryptionProperties, SupplyAadPrefix) { + parquet::FileDecryptionProperties::Builder builder; + builder.footer_key(kFooterEncryptionKey); + builder.aad_prefix(kFileName); + std::shared_ptr props = builder.build(); + + ASSERT_EQ(kFileName, props->aad_prefix()); +} + +TEST(ColumnDecryptionProperties, SetKey) { + std::shared_ptr column_path_1 = + parquet::schema::ColumnPath::FromDotString("column_1"); + ColumnDecryptionProperties::Builder col_builder_1(column_path_1); + col_builder_1.key(kColumnEncryptionKey1); + + auto props = col_builder_1.build(); + ASSERT_EQ(kColumnEncryptionKey1, props->key()); +} + +TEST(TestDecryptionProperties, UsingExplicitFooterAndColumnKeys) { + std::shared_ptr column_path_1 = + parquet::schema::ColumnPath::FromDotString("column_1"); + std::shared_ptr column_path_2 = + parquet::schema::ColumnPath::FromDotString("column_2"); + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + decryption_cols; + parquet::ColumnDecryptionProperties::Builder col_builder_1(column_path_1); + parquet::ColumnDecryptionProperties::Builder col_builder_2(column_path_2); + + decryption_cols[column_path_1] = col_builder_1.key(kColumnEncryptionKey1)->build(); + decryption_cols[column_path_2] = col_builder_2.key(kColumnEncryptionKey2)->build(); + + parquet::FileDecryptionProperties::Builder builder; + builder.footer_key(kFooterEncryptionKey); + builder.column_properties(decryption_cols); + std::shared_ptr props = builder.build(); + + ASSERT_EQ(kFooterEncryptionKey, props->footer_key()); + ASSERT_EQ(kColumnEncryptionKey1, props->column_key(column_path_1)); + ASSERT_EQ(kColumnEncryptionKey2, props->column_key(column_path_2)); +} + +} // namespace test +} // namespace parquet From 72c455475aacbac101015249f945226902c70a61 Mon Sep 
17 00:00:00 2001 From: thamht4190 Date: Sun, 2 Jun 2019 18:12:04 +0700 Subject: [PATCH 098/201] write unit tests for metadata --- cpp/src/parquet/CMakeLists.txt | 1 + cpp/src/parquet/encryption-metadata-test.cc | 85 +++++++++++++++++++++ cpp/src/parquet/file_writer.cc | 10 +-- cpp/src/parquet/metadata.cc | 31 +++++--- cpp/src/parquet/metadata.h | 4 +- 5 files changed, 110 insertions(+), 21 deletions(-) create mode 100644 cpp/src/parquet/encryption-metadata-test.cc diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 77de75553f0c..68f2ae7ec17a 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -323,6 +323,7 @@ add_parquet_test(arrow-test if(PARQUET_BUILD_ENCRYPTION) add_parquet_test(encryption-test) + add_parquet_test(encryption-metadata-test) endif() # Those tests need to use static linking as they access thrift-generated diff --git a/cpp/src/parquet/encryption-metadata-test.cc b/cpp/src/parquet/encryption-metadata-test.cc new file mode 100644 index 000000000000..f81493dbed05 --- /dev/null +++ b/cpp/src/parquet/encryption-metadata-test.cc @@ -0,0 +1,85 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "parquet/metadata.h" + +#include + +#include "parquet/properties.h" +#include "parquet/schema.h" + +namespace parquet { + +namespace metadata { + +const std::string kFooterEncryptionKey = "0123456789012345"; // 128bit/16 +const std::string kColumnEncryptionKey1 = "1234567890123450"; +const std::string kColumnEncryptionKey2 = "1234567890123451"; + +TEST(Metadata, EncryptFooter) { + parquet::schema::NodeVector fields; + parquet::schema::NodePtr root; + parquet::SchemaDescriptor schema; + + fields.push_back(parquet::schema::Int32("int_col", Repetition::REQUIRED)); + fields.push_back(parquet::schema::Float("float_col", Repetition::REQUIRED)); + root = parquet::schema::GroupNode::Make("schema", Repetition::REPEATED, fields); + schema.Init(root); + + FileEncryptionProperties::Builder encryption_prop_builder(kFooterEncryptionKey); + encryption_prop_builder.footer_key_metadata("kf"); + + WriterProperties::Builder writer_prop_builder; + writer_prop_builder.encryption(encryption_prop_builder.build()); + auto props = writer_prop_builder.build(); + + auto f_builder = FileMetaDataBuilder::Make(&schema, props); + auto file_metadata = f_builder->Finish(); + ASSERT_EQ(false, file_metadata->is_encryption_algorithm_set()); + + auto file_crypto_metadata = f_builder->GetCryptoMetaData(); + ASSERT_EQ(true, file_crypto_metadata != NULLPTR); +} + +TEST(Metadata, PlaintextFooter) { + parquet::schema::NodeVector fields; + parquet::schema::NodePtr root; + parquet::SchemaDescriptor schema; + + fields.push_back(parquet::schema::Int32("int_col", Repetition::REQUIRED)); + fields.push_back(parquet::schema::Float("float_col", Repetition::REQUIRED)); + root = parquet::schema::GroupNode::Make("schema", Repetition::REPEATED, fields); + schema.Init(root); + + FileEncryptionProperties::Builder encryption_prop_builder(kFooterEncryptionKey); + encryption_prop_builder.footer_key_metadata("kf"); + encryption_prop_builder.set_plaintext_footer(); + + WriterProperties::Builder 
writer_prop_builder; + writer_prop_builder.encryption(encryption_prop_builder.build()); + auto props = writer_prop_builder.build(); + + auto f_builder = FileMetaDataBuilder::Make(&schema, props); + auto file_metadata = f_builder->Finish(); + ASSERT_EQ(true, file_metadata->is_encryption_algorithm_set()); + + auto file_crypto_metadata = f_builder->GetCryptoMetaData(); + ASSERT_EQ(NULLPTR, file_crypto_metadata); +} + +} // namespace metadata +} // namespace parquet diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 894100262173..b6477c0d844a 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -309,15 +309,7 @@ class FileSerializer : public ParquetFileWriter::Contents { sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4); sink_->Write(kParquetEMagic, 4); } else { // Encrypted file with plaintext footer - EncryptionAlgorithm signing_encryption; - EncryptionAlgorithm algo = file_encryption_properties->algorithm(); - signing_encryption.aad.aad_file_unique = algo.aad.aad_file_unique; - signing_encryption.aad.supply_aad_prefix = algo.aad.supply_aad_prefix; - if (!algo.aad.supply_aad_prefix) - signing_encryption.aad.aad_prefix = algo.aad.aad_prefix; - signing_encryption.algorithm = ParquetCipher::AES_GCM_V1; - file_metadata_ = metadata_->Finish( - &signing_encryption, file_encryption_properties->footer_key_metadata()); + file_metadata_ = metadata_->Finish(); auto footer_signing_encryptor = file_encryptor_->GetFooterSigningEncryptor(); WriteFileMetaData(*file_metadata_, sink_.get(), footer_signing_encryptor, false); diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 291930f93daf..71d9dc9434c1 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -922,6 +922,12 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { thrift_encodings.push_back(ToThrift(Encoding::PLAIN)); } column_metadata_.__set_encodings(thrift_encodings); + + // temporary fix: 
setting for columnchunk meta_data in case file is not encrypted + if (properties_->file_encryption_properties() == NULLPTR) { + column_chunk_->__isset.meta_data = true; + column_chunk_->__set_meta_data(column_metadata_); + } } void WriteTo(::arrow::io::OutputStream* sink, @@ -1235,8 +1241,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { return current_row_group_builder_.get(); } - std::unique_ptr Finish(const EncryptionAlgorithm* signing_algorithm, - const std::string& footer_signing_key_metadata) { + std::unique_ptr Finish() { int64_t total_rows = 0; for (auto row_group : row_groups_) { total_rows += row_group.num_rows; @@ -1282,8 +1287,20 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { metadata_->column_orders.resize(schema_->num_columns(), column_order); metadata_->__isset.column_orders = true; - if (signing_algorithm != NULLPTR) { - metadata_->__set_encryption_algorithm(ToThrift(*signing_algorithm)); + // if plaintext footer, set footer signing algorithm + auto file_encryption_properties = properties_->file_encryption_properties(); + if (file_encryption_properties && !file_encryption_properties->encrypted_footer()) { + EncryptionAlgorithm signing_algorithm; + EncryptionAlgorithm algo = file_encryption_properties->algorithm(); + signing_algorithm.aad.aad_file_unique = algo.aad.aad_file_unique; + signing_algorithm.aad.supply_aad_prefix = algo.aad.supply_aad_prefix; + if (!algo.aad.supply_aad_prefix) + signing_algorithm.aad.aad_prefix = algo.aad.aad_prefix; + signing_algorithm.algorithm = ParquetCipher::AES_GCM_V1; + + metadata_->__set_encryption_algorithm(ToThrift(signing_algorithm)); + const std::string& footer_signing_key_metadata = + file_encryption_properties->footer_key_metadata(); if (footer_signing_key_metadata.size() > 0) { metadata_->__set_footer_signing_key_metadata(footer_signing_key_metadata); } @@ -1353,11 +1370,7 @@ RowGroupMetaDataBuilder* FileMetaDataBuilder::AppendRowGroup() { return impl_->AppendRowGroup(); } -std::unique_ptr 
FileMetaDataBuilder::Finish( - const EncryptionAlgorithm* signing_algorithm, - const std::string& footer_signing_key_metadata) { - return impl_->Finish(signing_algorithm, footer_signing_key_metadata); -} +std::unique_ptr FileMetaDataBuilder::Finish() { return impl_->Finish(); } std::unique_ptr FileMetaDataBuilder::GetCryptoMetaData() { return impl_->BuildFileCryptoMetaData(); diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 83f2d4a7ce63..5d3695a32527 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -354,9 +354,7 @@ class PARQUET_EXPORT FileMetaDataBuilder { RowGroupMetaDataBuilder* AppendRowGroup(); // Complete the Thrift structure - std::unique_ptr Finish( - const EncryptionAlgorithm* signing_algorithm = NULLPTR, - const std::string& footer_signing_key_metadata = ""); + std::unique_ptr Finish(); // crypto metadata std::unique_ptr GetCryptoMetaData(); From c0585f9176349b8a6aa3c6d348feb4366ba4a6c7 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Sun, 2 Jun 2019 15:58:39 +0300 Subject: [PATCH 099/201] Add encryption samples --- cpp/examples/parquet/CMakeLists.txt | 6 +- ...yption-reader-writer-all-crypto-options.cc | 792 +++++++++++++++++ .../low-level-api/encryption-reader-writer.cc | 809 +++++++++--------- cpp/src/parquet/encryption.cc | 10 +- cpp/src/parquet/encryption.h | 259 ++++-- cpp/src/parquet/file_reader.cc | 52 +- 6 files changed, 1389 insertions(+), 539 deletions(-) create mode 100644 cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc diff --git a/cpp/examples/parquet/CMakeLists.txt b/cpp/examples/parquet/CMakeLists.txt index 2a2421c18ab9..fb4287303607 100644 --- a/cpp/examples/parquet/CMakeLists.txt +++ b/cpp/examples/parquet/CMakeLists.txt @@ -18,12 +18,15 @@ add_executable(parquet-low-level-example low-level-api/reader-writer.cc) add_executable(parquet-low-level-example2 low-level-api/reader-writer2.cc) add_executable(parquet-encryption-example 
low-level-api/encryption-reader-writer.cc) +add_executable(parquet-encryption-example-all-crypto-options low-level-api/encryption-reader-writer-all-crypto-options.cc) target_include_directories(parquet-low-level-example PRIVATE low-level-api/) target_include_directories(parquet-low-level-example2 PRIVATE low-level-api/) target_include_directories(parquet-encryption-example PRIVATE low-level-api/) +target_include_directories(parquet-encryption-example-all-crypto-options PRIVATE low-level-api/) target_link_libraries(parquet-low-level-example parquet_static) target_link_libraries(parquet-low-level-example2 parquet_static) target_link_libraries(parquet-encryption-example parquet_static) +target_link_libraries(parquet-encryption-example-all-crypto-options parquet_static) add_executable(parquet-arrow-example parquet-arrow/reader-writer.cc) # Prefer shared linkage but use static if shared build is deactivated @@ -37,4 +40,5 @@ add_dependencies(parquet parquet-low-level-example parquet-low-level-example2 parquet-encryption-example - parquet-arrow-example) + parquet-arrow-example + parquet-encryption-example-all-crypto-options) diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc new file mode 100644 index 000000000000..98f0e57a7aac --- /dev/null +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc @@ -0,0 +1,792 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * This file contains samples for writing and reading encrypted Parquet files in different + * encryption and decryption configurations. The samples have the following goals: + * 1) Demonstrate usage of different options for data encryption and decryption. + * 2) Produce encrypted files for interoperability tests with other (e.g. parquet-mr) + * readers that support encryption. + * 3) Produce encrypted files with a plaintext footer, for testing the ability of legacy + * readers to parse the footer and read unencrypted columns. + * 4) Perform interoperability tests with other (e.g. parquet-mr) writers, by reading + * encrypted files produced by these writers. + * + * The write sample produces a number of Parquet files, each encrypted with a different + * encryption configuration as described below. + * The name of each file is in the form of: + * testerN.parquet.encrypted, where N is the encryption configuration number. + * + * The read sample creates a set of decryption configurations and then uses each of them + * to read all encrypted files in the input directory. + * + * The different encryption and decryption configurations are listed below.
+ * + * Usage: ./encryption-interop-tests write|read path-to-directory-of-parquet-files + * + * A detailed description of the Parquet Modular Encryption specification can be found + * here: + * https://github.com/apache/parquet-format/blob/encryption/Encryption.md + * + * The write sample creates files with eight columns in the following + * encryption configurations: + * + * - Encryption configuration 1: Encrypt all columns and the footer with the same key. + * (uniform encryption) + * - Encryption configuration 2: Encrypt two columns and the footer. + * - Encryption configuration 3: Encrypt two columns. Do not encrypt the footer (to enable + * legacy readers) - plaintext footer mode. + * - Encryption configuration 4: Encrypt two columns and the footer. Supply aad_prefix + * for file identity verification. + * - Encryption configuration 5: Encrypt two columns and the footer. Supply aad_prefix, + * and call disable_store_aad_prefix_storage to prevent file + * identity storage in file metadata. + * - Encryption configuration 6: Encrypt two columns and the footer. Use the + * alternative (AES_GCM_CTR_V1) algorithm. + * + * The read sample uses each of the following decryption configurations to read every + * encrypted file in the input directory: + * + * - Decryption configuration 1: Decrypt using a key retriever that holds the keys of + * two encrypted columns and the footer key. + * - Decryption configuration 2: Decrypt using a key retriever that holds the keys of + * two encrypted columns and the footer key. Supply + * aad_prefix to verify file identity. + * - Decryption configuration 3: Decrypt using explicit column and footer keys + * (instead of a key retrieval callback).
+ */ + +constexpr int NUM_ROWS_PER_ROW_GROUP = 500; + +const std::string kFooterEncryptionKey = "0123456789012345"; // 128bit/16 +const std::string kColumnEncryptionKey1 = "1234567890123450"; +const std::string kColumnEncryptionKey2 = "1234567890123451"; +const std::string fileName = "tester"; + +void PrintDecryptionConfiguration(int configuration); +void CheckResult(std::string file, int example_id, std::string exception_msg); +int ExtractEncryptionConfigurationNumber(std::string file); + +std::vector GetDirectoryFiles(const std::string& path) { + std::vector files; + struct dirent* entry; + DIR* dir = opendir(path.c_str()); + + if (dir == NULL) { + exit(-1); + } + while ((entry = readdir(dir)) != NULL) { + files.push_back(std::string(entry->d_name)); + } + closedir(dir); + return files; +} + +void InteropTestWriteEncryptedParquetFiles(std::string rootPath) { + /********************************************************************************** + Creating a number of Encryption configurations + **********************************************************************************/ + + // This vector will hold various encryption configuraions. + std::vector> + vector_of_encryption_configurations; + + // Encryption configuration 1: Encrypt all columns and the footer with the same key. + // (uniform encryption) + parquet::FileEncryptionProperties::Builder file_encryption_builder_1( + kFooterEncryptionKey); + // Add to list of encryption configurations. + vector_of_encryption_configurations.push_back( + file_encryption_builder_1.footer_key_metadata("kf")->build()); + + // Encryption configuration 2: Encrypt two columns and the footer. 
+ std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols2; + std::shared_ptr path_ptr = + parquet::schema::ColumnPath::FromDotString("double_field"); + std::shared_ptr path_ptr1 = + parquet::schema::ColumnPath::FromDotString("float_field"); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_20(path_ptr); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_21(path_ptr1); + encryption_col_builder_20.key(kColumnEncryptionKey1)->key_id("kc1"); + encryption_col_builder_21.key(kColumnEncryptionKey2)->key_id("kc2"); + + encryption_cols2[path_ptr] = encryption_col_builder_20.build(); + encryption_cols2[path_ptr1] = encryption_col_builder_21.build(); + + parquet::FileEncryptionProperties::Builder file_encryption_builder_2( + kFooterEncryptionKey); + + vector_of_encryption_configurations.push_back( + file_encryption_builder_2.footer_key_metadata("kf") + ->column_properties(encryption_cols2) + ->build()); + + // Encryption configuration 3: Encrypt two columns, don’t encrypt footer. 
+ // (plaintext footer mode, readable by legacy readers) + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols3; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_30(path_ptr); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_31(path_ptr1); + encryption_col_builder_30.key(kColumnEncryptionKey1)->key_id("kc1"); + encryption_col_builder_31.key(kColumnEncryptionKey2)->key_id("kc2"); + + encryption_cols3[path_ptr] = encryption_col_builder_30.build(); + encryption_cols3[path_ptr1] = encryption_col_builder_31.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_3( + kFooterEncryptionKey); + + vector_of_encryption_configurations.push_back( + file_encryption_builder_3.footer_key_metadata("kf") + ->column_properties(encryption_cols3) + ->set_plaintext_footer() + ->build()); + + // Encryption configuration 4: Encrypt two columns and the footer. Use aad_prefix. + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols4; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_40(path_ptr); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_41(path_ptr1); + encryption_col_builder_40.key(kColumnEncryptionKey1)->key_id("kc1"); + encryption_col_builder_41.key(kColumnEncryptionKey2)->key_id("kc2"); + + encryption_cols4[path_ptr] = encryption_col_builder_40.build(); + encryption_cols4[path_ptr1] = encryption_col_builder_41.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_4( + kFooterEncryptionKey); + + vector_of_encryption_configurations.push_back( + file_encryption_builder_4.footer_key_metadata("kf") + ->column_properties(encryption_cols4) + ->aad_prefix(fileName) + ->build()); + + // Encryption configuration 5: Encrypt two columns and the footer. Use aad_prefix and + // disable_aad_prefix_storage. 
+ std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols5; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_50(path_ptr); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_51(path_ptr1); + encryption_col_builder_50.key(kColumnEncryptionKey1)->key_id("kc1"); + encryption_col_builder_51.key(kColumnEncryptionKey2)->key_id("kc2"); + + encryption_cols5[path_ptr] = encryption_col_builder_50.build(); + encryption_cols5[path_ptr1] = encryption_col_builder_51.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_5( + kFooterEncryptionKey); + + vector_of_encryption_configurations.push_back( + file_encryption_builder_5.column_properties(encryption_cols5) + ->footer_key_metadata("kf") + ->aad_prefix(fileName) + ->disable_store_aad_prefix_storage() + ->build()); + + // Encryption configuration 6: Encrypt two columns and the footer. Use AES_GCM_CTR_V1 + // algorithm. + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols6; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_60(path_ptr); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_61(path_ptr1); + encryption_col_builder_60.key(kColumnEncryptionKey1)->key_id("kc1"); + encryption_col_builder_61.key(kColumnEncryptionKey2)->key_id("kc2"); + + encryption_cols6[path_ptr] = encryption_col_builder_60.build(); + encryption_cols6[path_ptr1] = encryption_col_builder_61.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_6( + kFooterEncryptionKey); + + vector_of_encryption_configurations.push_back( + file_encryption_builder_6.footer_key_metadata("kf") + ->column_properties(encryption_cols6) + ->algorithm(parquet::ParquetCipher::AES_GCM_CTR_V1) + ->build()); + + /********************************************************************************** + PARQUET WRITER EXAMPLE + 
**********************************************************************************/ + + // Iterate over the encryption configurations and for each one write a parquet file. + for (unsigned example_id = 0; example_id < vector_of_encryption_configurations.size(); + ++example_id) { + std::stringstream ss; + ss << example_id + 1; + std::string test_number_string = ss.str(); + try { + // Create a local file output stream instance. + using FileClass = ::arrow::io::FileOutputStream; + std::shared_ptr out_file; + std::string file = + rootPath + fileName + std::string(test_number_string) + ".parquet.encrypted"; + std::cout << "Write " << file << std::endl; + PARQUET_THROW_NOT_OK(FileClass::Open(file, &out_file)); + + // Setup the parquet schema + std::shared_ptr schema = SetupSchema(); + + // Add writer properties + parquet::WriterProperties::Builder builder; + builder.compression(parquet::Compression::SNAPPY); + + // Add the current encryption configuration to WriterProperties. + builder.encryption(vector_of_encryption_configurations[example_id]); + + std::shared_ptr props = builder.build(); + + // Create a ParquetFileWriter instance + std::shared_ptr file_writer = + parquet::ParquetFileWriter::Open(out_file, schema, props); + + // Append a RowGroup with a specific number of rows. + parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup(); + + // Write the Bool column + parquet::BoolWriter* bool_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + bool value = ((i % 2) == 0) ? true : false; + bool_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Int32 column + parquet::Int32Writer* int32_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + int32_t value = i; + int32_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Int64 column. Each row has repeats twice. 
+ parquet::Int64Writer* int64_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < 2 * NUM_ROWS_PER_ROW_GROUP; i++) { + int64_t value = i * 1000 * 1000; + value *= 1000 * 1000; + int16_t definition_level = 1; + int16_t repetition_level = 0; + if ((i % 2) == 0) { + repetition_level = 1; // start of a new record + } + int64_writer->WriteBatch(1, &definition_level, &repetition_level, &value); + } + + // Write the INT96 column. + parquet::Int96Writer* int96_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + parquet::Int96 value; + value.value[0] = i; + value.value[1] = i + 1; + value.value[2] = i + 2; + int96_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Float column + parquet::FloatWriter* float_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + float value = static_cast(i) * 1.1f; + float_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Double column + parquet::DoubleWriter* double_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + double value = i * 1.1111111; + double_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the ByteArray column. 
Make every alternate values NULL + parquet::ByteArrayWriter* ba_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + parquet::ByteArray value; + char hello[FIXED_LENGTH] = "parquet"; + hello[7] = static_cast(static_cast('0') + i / 100); + hello[8] = static_cast(static_cast('0') + (i / 10) % 10); + hello[9] = static_cast(static_cast('0') + i % 10); + if (i % 2 == 0) { + int16_t definition_level = 1; + value.ptr = reinterpret_cast(&hello[0]); + value.len = FIXED_LENGTH; + ba_writer->WriteBatch(1, &definition_level, nullptr, &value); + } else { + int16_t definition_level = 0; + ba_writer->WriteBatch(1, &definition_level, nullptr, nullptr); + } + } + + // Write the FixedLengthByteArray column + parquet::FixedLenByteArrayWriter* flba_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + parquet::FixedLenByteArray value; + char v = static_cast(i); + char flba[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v}; + value.ptr = reinterpret_cast(&flba[0]); + + flba_writer->WriteBatch(1, nullptr, nullptr, &value); + } + // Close the ParquetFileWriter + file_writer->Close(); + + // Write the bytes to file + DCHECK(out_file->Close().ok()); + } catch (const std::exception& e) { + std::cerr << "Parquet write error: " << e.what() << std::endl; + return; + } + } +} + +void InteropTestReadEncryptedParquetFiles(std::string rootPath) { + std::vector files_in_directory = GetDirectoryFiles(rootPath); + + /********************************************************************************** + Creating a number of Decryption configurations + **********************************************************************************/ + + // This vector will hold various decryption configurations. + std::vector> + vector_of_decryption_configurations; + + // Decryption configuration 1: Decrypt using key retriever callback that holds the keys + // of two encrypted columns and the footer key. 
+ + std::shared_ptr string_kr1 = + std::make_shared(); + string_kr1->PutKey("kf", kFooterEncryptionKey); + string_kr1->PutKey("kc1", kColumnEncryptionKey1); + string_kr1->PutKey("kc2", kColumnEncryptionKey2); + std::shared_ptr kr1 = + std::static_pointer_cast(string_kr1); + + parquet::FileDecryptionProperties::Builder file_decryption_builder_1; + vector_of_decryption_configurations.push_back( + file_decryption_builder_1.key_retriever(kr1)->build()); + + // Decryption configuration 2: Decrypt using key retriever callback that holds the keys + // of two encrypted columns and the footer key. Supply aad_prefix. + std::shared_ptr string_kr2 = + std::make_shared(); + string_kr2->PutKey("kf", kFooterEncryptionKey); + string_kr2->PutKey("kc1", kColumnEncryptionKey1); + string_kr2->PutKey("kc2", kColumnEncryptionKey2); + std::shared_ptr kr2 = + std::static_pointer_cast(string_kr2); + + parquet::FileDecryptionProperties::Builder file_decryption_builder_2; + vector_of_decryption_configurations.push_back( + file_decryption_builder_2.key_retriever(kr2)->aad_prefix(fileName)->build()); + + // Decryption configuration 3: Decrypt using explicit column and footer keys. Supply + // aad_prefix. 
+ std::shared_ptr path_float_ptr = + parquet::schema::ColumnPath::FromDotString("float_field"); + std::shared_ptr path_double_ptr = + parquet::schema::ColumnPath::FromDotString("double_field"); + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + decryption_cols; + parquet::ColumnDecryptionProperties::Builder decryption_col_builder31(path_double_ptr); + parquet::ColumnDecryptionProperties::Builder decryption_col_builder32(path_float_ptr); + + decryption_cols[path_double_ptr] = + decryption_col_builder31.key(kColumnEncryptionKey1)->build(); + decryption_cols[path_float_ptr] = + decryption_col_builder32.key(kColumnEncryptionKey2)->build(); + + parquet::FileDecryptionProperties::Builder file_decryption_builder_3; + vector_of_decryption_configurations.push_back( + file_decryption_builder_3.footer_key(kFooterEncryptionKey) + ->column_properties(decryption_cols) + ->build()); + + /********************************************************************************** + PARQUET READER EXAMPLE + **********************************************************************************/ + + // Iterate over the decryption configurations and use each one to read every files + // in the input directory. + for (unsigned example_id = 0; example_id < vector_of_decryption_configurations.size(); + ++example_id) { + PrintDecryptionConfiguration(example_id + 1); + for (auto const& file : files_in_directory) { + std::string exception_msg = ""; + if (file.find("parquet.encrypted") == + std::string::npos) // Skip non encrypted files + continue; + try { + std::cout << "--> Read file " << file << std::endl; + + parquet::ReaderProperties reader_properties = + parquet::default_reader_properties(); + + // Add the current decryption configuration to ReaderProperties. 
+ reader_properties.file_decryption_properties( + vector_of_decryption_configurations[example_id]->DeepClone()); + + // Create a ParquetReader instance + std::unique_ptr parquet_reader = + parquet::ParquetFileReader::OpenFile(rootPath + file, false, + reader_properties); + + // Get the File MetaData + std::shared_ptr file_metadata = parquet_reader->metadata(); + + // Get the number of RowGroups + int num_row_groups = file_metadata->num_row_groups(); + assert(num_row_groups == 1); + + // Get the number of Columns + int num_columns = file_metadata->num_columns(); + assert(num_columns == 8); + + // Iterate over all the RowGroups in the file + for (int r = 0; r < num_row_groups; ++r) { + // Get the RowGroup Reader + std::shared_ptr row_group_reader = + parquet_reader->RowGroup(r); + + int64_t values_read = 0; + int64_t rows_read = 0; + int16_t definition_level; + int16_t repetition_level; + int i; + std::shared_ptr column_reader; + + // Get the Column Reader for the boolean column + column_reader = row_group_reader->Column(0); + parquet::BoolReader* bool_reader = + static_cast(column_reader.get()); + + // Read all the rows in the column + i = 0; + while (bool_reader->HasNext()) { + bool value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + bool expected_value = ((i % 2) == 0) ? true : false; + assert(value == expected_value); + i++; + } + + // Get the Column Reader for the Int32 column + column_reader = row_group_reader->Column(1); + parquet::Int32Reader* int32_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int32_reader->HasNext()) { + int32_t value; + // Read one value at a time. 
The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = + int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + assert(value == i); + i++; + } + + // Get the Column Reader for the Int64 column + column_reader = row_group_reader->Column(2); + parquet::Int64Reader* int64_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int64_reader->HasNext()) { + int64_t value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = int64_reader->ReadBatch(1, &definition_level, &repetition_level, + &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + int64_t expected_value = i * 1000 * 1000; + expected_value *= 1000 * 1000; + assert(value == expected_value); + if ((i % 2) == 0) { + assert(repetition_level == 1); + } else { + assert(repetition_level == 0); + } + i++; + } + + // Get the Column Reader for the Int96 column + column_reader = row_group_reader->Column(3); + parquet::Int96Reader* int96_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int96_reader->HasNext()) { + parquet::Int96 value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = + int96_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + parquet::Int96 expected_value; + expected_value.value[0] = i; + expected_value.value[1] = i + 1; + expected_value.value[2] = i + 2; + for (int j = 0; j < 3; j++) { + assert(value.value[j] == expected_value.value[j]); + } + i++; + } + + // Get the Column Reader for the Float column + column_reader = row_group_reader->Column(4); + parquet::FloatReader* float_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (float_reader->HasNext()) { + float value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = + float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + float expected_value = static_cast(i) * 1.1f; + assert(value == expected_value); + i++; + } + + // Get the Column Reader for the Double column + column_reader = row_group_reader->Column(5); + parquet::DoubleReader* double_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (double_reader->HasNext()) { + double value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = + double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + double expected_value = i * 1.1111111; + assert(value == expected_value); + i++; + } + + // Get the Column Reader for the ByteArray column + column_reader = row_group_reader->Column(6); + parquet::ByteArrayReader* ba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (ba_reader->HasNext()) { + parquet::ByteArray value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = + ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // Verify the value written + char expected_value[FIXED_LENGTH] = "parquet"; + expected_value[7] = static_cast('0' + i / 100); + expected_value[8] = static_cast('0' + (i / 10) % 10); + expected_value[9] = static_cast('0' + i % 10); + if (i % 2 == 0) { // only alternate values exist + // There are no NULL values in the rows written + assert(values_read == 1); + assert(value.len == FIXED_LENGTH); + assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0); + assert(definition_level == 1); + } else { + // There are NULL values in the rows written + assert(values_read == 0); + assert(definition_level == 0); + } + i++; + } + + // Get the Column Reader for the FixedLengthByteArray column + column_reader = row_group_reader->Column(7); + parquet::FixedLenByteArrayReader* flba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (flba_reader->HasNext()) { + parquet::FixedLenByteArray value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = flba_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + char v = static_cast(i); + char expected_value[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v}; + assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0); + i++; + } + } + } catch (const std::exception& e) { + exception_msg = e.what(); + } + CheckResult(file, example_id, exception_msg); + std::cout << "file [" << file << "] Parquet Reading Complete" << std::endl; + } + } +} + +void PrintDecryptionConfiguration(int configuration) { + std::cout << "\n\nDecryption configuration "; + if (configuration == 1) + std::cout << "1: \n\nDecrypt using key retriever that holds" + " the keys of two encrypted columns and the footer key." + << std::endl; + else if (configuration == 2) + std::cout << "2: \n\nDecrypt using key retriever that holds" + " the keys of two encrypted columns and the footer key. Pass aad_prefix." + << std::endl; + else if (configuration == 3) + std::cout << "3: \n\nDecrypt using explicit column and footer keys." << std::endl; + else { + std::cout << "Unknown configuraion" << std::endl; + exit(-1); + } + std::cout << std::endl; +} + +int ExtractEncryptionConfigurationNumber(std::string file) { + int encryption_configuration_number; + std::regex r("tester([0-9]+)\\.parquet.encrypted"); + std::smatch m; + std::regex_search(file, m, r); + if (m.size() == 0) { + std::cerr + << "Error: Error parsing filename to extract encryption configuration number. 
" + << std::endl; + } + std::string encryption_configuration_number_str = m.str(1); + encryption_configuration_number = atoi(encryption_configuration_number_str.c_str()); + if (encryption_configuration_number < 1 || encryption_configuration_number > 6) { + std::cerr << "Error: Unknown encryption configuration number. " << std::endl; + } + + return encryption_configuration_number; +} + +void CheckResult(std::string file, int example_id, std::string exception_msg) { + int encryption_configuration_number = ExtractEncryptionConfigurationNumber(file); + int decryption_configuration_number = example_id + 1; + + // Encryption_configuration number five contains aad_prefix and + // disable_aad_prefix_storage. + // An exception is expected to be thrown if the file is not decrypted with aad_prefix. + if (encryption_configuration_number == 5) { + if (decryption_configuration_number == 1 || decryption_configuration_number == 3) { + std::size_t found = exception_msg.find("AAD"); + if (found == std::string::npos) + std::cout << "Error: Expecting AAD related exception."; + return; + } + } + // Decryption configuration number two contains aad_prefix. An exception is expected to + // be thrown if the file was not encrypted with the same aad_prefix. + if (decryption_configuration_number == 2) { + if (encryption_configuration_number != 5 && encryption_configuration_number != 4) { + std::size_t found = exception_msg.find("AAD"); + if (found == std::string::npos) { + std::cout << "Error: Expecting AAD related exception." << std::endl; + } + return; + } + } + if (!exception_msg.empty()) + std::cout << "Error: Unexpected exception was thrown." 
<< exception_msg; +} + +int main(int argc, char** argv) { + enum Operation { write, read }; + std::string rootPath; + Operation operation = write; + if (argc < 3) { + std::cout << "Usage: encryption-reader-writer-all-crypto-options " + "" + << std::endl; + exit(1); + } + rootPath = argv[1]; + if (rootPath.compare("read") == 0) { + operation = read; + } + + rootPath = argv[2]; + std::cout << "Root path is: " << rootPath << std::endl; + + if (operation == write) { + InteropTestWriteEncryptedParquetFiles(rootPath); + } else + InteropTestReadEncryptedParquetFiles(rootPath); + + return 0; +} diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc index a75c0352cdab..5ce66769c0fb 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc @@ -22,471 +22,428 @@ #include - - /* - * This example describes writing and reading Parquet Files in C++ with encrypted columns - * and serves as a reference to the Parquet Modular Encryption API. + * This file contains sample for writing and reading encrypted Parquet file with + * basic encryption configuration. * - * A detailed description of the Parquet Modular Encryption specification can be found here: + * A detailed description of the Parquet Modular Encryption specification can be found + * here: * https://github.com/apache/parquet-format/blob/encryption/Encryption.md * - * The example contains writing and reading eight columns with the following four different - * encryption configurations: - * 1) uniform encryption - footer and all columns are encrypted with footer key. - * 2) non-uniform encryption - footer and ba_field column are encrypted with different keys. - * 3) plaintext footer mode where all columns are encrypted with footer key. - * 4) footer and ba_field column are encrypted with different keys. 
no column key - * is provided upon decryption and thus HiddenColumnException is thrown. - **/ + * The write sample creates a file with eight columns where two of the columns and the + * footer are encrypted. + * + * The read sample decrypts using key retriever that holds the keys of two encrypted + * columns and the footer key. + */ constexpr int NUM_ROWS_PER_ROW_GROUP = 500; const char PARQUET_FILENAME[] = "parquet_cpp_example.parquet.encrypted"; -const std::string FOOTER_ENCRYPTION_KEY = "0123456789012345"; // 16 bytes -const std::string COLUMN_ENCRYPTION_KEY = "1234567890123450"; // 16 bytes +const std::string kFooterEncryptionKey = "0123456789012345"; // 128bit/16 +const std::string kColumnEncryptionKey1 = "1234567890123450"; +const std::string kColumnEncryptionKey2 = "1234567890123451"; int main(int argc, char** argv) { - std::vector> vector_of_encryption_configurations; - std::vector> vector_of_decryption_configurations; - - // encryption configuration #1 - uniform encryption - all columns and footer are - // encrypted with footer key. - parquet::FileEncryptionProperties::Builder file_encryption_builder_1(FOOTER_ENCRYPTION_KEY); - parquet::FileDecryptionProperties::Builder file_decryption_builder_1; - - // Add the properties to the appropriate configurations vectors - vector_of_encryption_configurations.push_back(file_encryption_builder_1.build()); - vector_of_decryption_configurations.push_back(file_decryption_builder_1 - .footer_key(FOOTER_ENCRYPTION_KEY) - ->build()); - - // encryption configuration #2 - footer and ba_field column are encrypted with - // different keys. - std::map, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath> encryption_cols; - std::shared_ptr path_ptr = parquet::schema::ColumnPath::FromDotString("ba_field"); - parquet::ColumnEncryptionProperties::Builder encryption_col_builder_0(path_ptr); - encryption_cols[path_ptr] = encryption_col_builder_0. 
- key(COLUMN_ENCRYPTION_KEY) - ->build(); + /********************************************************************************** + PARQUET WRITER EXAMPLE + **********************************************************************************/ + // Encryption configuration: Encrypt two columns and the footer. std::map, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath> decryption_cols; - parquet::ColumnDecryptionProperties::Builder decryption_col_builder2(path_ptr); - decryption_cols[path_ptr] = decryption_col_builder2. - key(COLUMN_ENCRYPTION_KEY) - ->build(); - - parquet::FileEncryptionProperties::Builder file_encryption_builder_2(FOOTER_ENCRYPTION_KEY); - parquet::FileDecryptionProperties::Builder file_decryption_builder_2; - - // Add the properties to the appropriate configurations vectors - vector_of_encryption_configurations.push_back(file_encryption_builder_2 - .column_properties(encryption_cols) - ->build()); - vector_of_decryption_configurations.push_back(file_decryption_builder_2 - .footer_key(FOOTER_ENCRYPTION_KEY) - ->column_properties(decryption_cols) - ->build()); - - // encryption configuration #3 - plain mode footer - parquet::FileEncryptionProperties::Builder file_encryption_builder_3(FOOTER_ENCRYPTION_KEY); - parquet::FileDecryptionProperties::Builder file_decryption_builder_3; - - // Add the properties to the appropriate configurations vectors - vector_of_encryption_configurations.push_back(file_encryption_builder_3 - .set_plaintext_footer() - ->build()); - vector_of_decryption_configurations.push_back(file_decryption_builder_3 - .footer_key(FOOTER_ENCRYPTION_KEY) - ->build()); - - // encryption configuration #4 - footer and ba_field column are encrypted with different keys. - // no column key is provided upon decryption and thus HiddenColumnException is thrown. 
- parquet::FileEncryptionProperties::Builder file_encryption_builder_4(FOOTER_ENCRYPTION_KEY); - parquet::FileDecryptionProperties::Builder file_decryption_builder_4; - - // Add the properties to the appropriate configurations vectors - vector_of_encryption_configurations.push_back(file_encryption_builder_4 - .set_plaintext_footer() - ->column_properties(encryption_cols) - ->build()); - - vector_of_decryption_configurations.push_back(file_decryption_builder_4 - .footer_key(FOOTER_ENCRYPTION_KEY) - ->build()); - - for (unsigned example_id = 0; example_id < vector_of_encryption_configurations.size(); ++example_id) { - /********************************************************************************** - PARQUET WRITER EXAMPLE - **********************************************************************************/ - - try { - - // Create a local file output stream instance. - using FileClass = ::arrow::io::FileOutputStream; - std::shared_ptr out_file; - PARQUET_THROW_NOT_OK(FileClass::Open(PARQUET_FILENAME, &out_file)); - - // Setup the parquet schema - std::shared_ptr schema = SetupSchema(); - - // Add writer properties - parquet::WriterProperties::Builder builder; - builder.compression(parquet::Compression::SNAPPY); - - // Add the current encryption configuration to WriterProperties. - builder.encryption(vector_of_encryption_configurations[example_id]); - - std::shared_ptr props = builder.build(); - - // Create a ParquetFileWriter instance - std::shared_ptr file_writer = - parquet::ParquetFileWriter::Open(out_file, schema, props); - - // Append a RowGroup with a specific number of rows. - parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup(); - - // Write the Bool column - parquet::BoolWriter* bool_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { - bool value = ((i % 2) == 0) ? 
true : false; - bool_writer->WriteBatch(1, nullptr, nullptr, &value); - } - - // Write the Int32 column - parquet::Int32Writer* int32_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { - int32_t value = i; - int32_writer->WriteBatch(1, nullptr, nullptr, &value); - } - - // Write the Int64 column. Each row has repeats twice. - parquet::Int64Writer* int64_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < 2 * NUM_ROWS_PER_ROW_GROUP; i++) { - int64_t value = i * 1000 * 1000; - value *= 1000 * 1000; - int16_t definition_level = 1; - int16_t repetition_level = 0; - if ((i % 2) == 0) { - repetition_level = 1; // start of a new record - } - int64_writer->WriteBatch(1, &definition_level, &repetition_level, &value); - } + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols; + std::shared_ptr path_ptr = + parquet::schema::ColumnPath::FromDotString("double_field"); + std::shared_ptr path_ptr1 = + parquet::schema::ColumnPath::FromDotString("float_field"); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder0(path_ptr); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder1(path_ptr1); + encryption_col_builder0.key(kColumnEncryptionKey1)->key_id("kc1"); + encryption_col_builder1.key(kColumnEncryptionKey2)->key_id("kc2"); + + encryption_cols[path_ptr] = encryption_col_builder0.build(); + encryption_cols[path_ptr1] = encryption_col_builder1.build(); + + parquet::FileEncryptionProperties::Builder file_encryption_builder( + kFooterEncryptionKey); + + try { + // Create a local file output stream instance. 
+ using FileClass = ::arrow::io::FileOutputStream; + std::shared_ptr out_file; + PARQUET_THROW_NOT_OK(FileClass::Open(PARQUET_FILENAME, &out_file)); + + // Setup the parquet schema + std::shared_ptr schema = SetupSchema(); + + // Add writer properties + parquet::WriterProperties::Builder builder; + builder.compression(parquet::Compression::SNAPPY); + + // Add the current encryption configuration to WriterProperties. + builder.encryption(file_encryption_builder.footer_key_metadata("kf") + ->column_properties(encryption_cols) + ->build()); + + std::shared_ptr props = builder.build(); + + // Create a ParquetFileWriter instance + std::shared_ptr file_writer = + parquet::ParquetFileWriter::Open(out_file, schema, props); + + // Append a RowGroup with a specific number of rows. + parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup(); + + // Write the Bool column + parquet::BoolWriter* bool_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + bool value = ((i % 2) == 0) ? true : false; + bool_writer->WriteBatch(1, nullptr, nullptr, &value); + } - // Write the INT96 column. - parquet::Int96Writer* int96_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { - parquet::Int96 value; - value.value[0] = i; - value.value[1] = i + 1; - value.value[2] = i + 2; - int96_writer->WriteBatch(1, nullptr, nullptr, &value); - } + // Write the Int32 column + parquet::Int32Writer* int32_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + int32_t value = i; + int32_writer->WriteBatch(1, nullptr, nullptr, &value); + } - // Write the Float column - parquet::FloatWriter* float_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { - float value = static_cast(i) * 1.1f; - float_writer->WriteBatch(1, nullptr, nullptr, &value); + // Write the Int64 column. Each row has repeats twice. 
+ parquet::Int64Writer* int64_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < 2 * NUM_ROWS_PER_ROW_GROUP; i++) { + int64_t value = i * 1000 * 1000; + value *= 1000 * 1000; + int16_t definition_level = 1; + int16_t repetition_level = 0; + if ((i % 2) == 0) { + repetition_level = 1; // start of a new record } + int64_writer->WriteBatch(1, &definition_level, &repetition_level, &value); + } - // Write the Double column - parquet::DoubleWriter* double_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { - double value = i * 1.1111111; - double_writer->WriteBatch(1, nullptr, nullptr, &value); - } + // Write the INT96 column. + parquet::Int96Writer* int96_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + parquet::Int96 value; + value.value[0] = i; + value.value[1] = i + 1; + value.value[2] = i + 2; + int96_writer->WriteBatch(1, nullptr, nullptr, &value); + } - // Write the ByteArray column. 
Make every alternate values NULL - parquet::ByteArrayWriter* ba_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { - parquet::ByteArray value; - char hello[FIXED_LENGTH] = "parquet"; - hello[7] = static_cast(static_cast('0') + i / 100); - hello[8] = static_cast(static_cast('0') + (i / 10) % 10); - hello[9] = static_cast(static_cast('0') + i % 10); - if (i % 2 == 0) { - int16_t definition_level = 1; - value.ptr = reinterpret_cast(&hello[0]); - value.len = FIXED_LENGTH; - ba_writer->WriteBatch(1, &definition_level, nullptr, &value); - } else { - int16_t definition_level = 0; - ba_writer->WriteBatch(1, &definition_level, nullptr, nullptr); - } - } + // Write the Float column + parquet::FloatWriter* float_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + float value = static_cast(i) * 1.1f; + float_writer->WriteBatch(1, nullptr, nullptr, &value); + } - // Write the FixedLengthByteArray column - parquet::FixedLenByteArrayWriter* flba_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { - parquet::FixedLenByteArray value; - char v = static_cast(i); - char flba[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v}; - value.ptr = reinterpret_cast(&flba[0]); + // Write the Double column + parquet::DoubleWriter* double_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + double value = i * 1.1111111; + double_writer->WriteBatch(1, nullptr, nullptr, &value); + } - flba_writer->WriteBatch(1, nullptr, nullptr, &value); + // Write the ByteArray column. 
Make every alternate values NULL + parquet::ByteArrayWriter* ba_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + parquet::ByteArray value; + char hello[FIXED_LENGTH] = "parquet"; + hello[7] = static_cast(static_cast('0') + i / 100); + hello[8] = static_cast(static_cast('0') + (i / 10) % 10); + hello[9] = static_cast(static_cast('0') + i % 10); + if (i % 2 == 0) { + int16_t definition_level = 1; + value.ptr = reinterpret_cast(&hello[0]); + value.len = FIXED_LENGTH; + ba_writer->WriteBatch(1, &definition_level, nullptr, &value); + } else { + int16_t definition_level = 0; + ba_writer->WriteBatch(1, &definition_level, nullptr, nullptr); } + } - // Close the ParquetFileWriter - file_writer->Close(); + // Write the FixedLengthByteArray column + parquet::FixedLenByteArrayWriter* flba_writer = + static_cast(rg_writer->NextColumn()); + for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { + parquet::FixedLenByteArray value; + char v = static_cast(i); + char flba[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v}; + value.ptr = reinterpret_cast(&flba[0]); - // Write the bytes to file - DCHECK(out_file->Close().ok()); - } catch (const std::exception& e) { - std::cerr << "Parquet write error: " << e.what() << std::endl; - return -1; + flba_writer->WriteBatch(1, nullptr, nullptr, &value); } - /********************************************************************************** - PARQUET READER EXAMPLE - **********************************************************************************/ - - try { - parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); - - // Add the current decryption configuration to ReaderProperties. 
- reader_properties.file_decryption_properties(vector_of_decryption_configurations[example_id]); - - // Create a ParquetReader instance - std::unique_ptr parquet_reader = - parquet::ParquetFileReader::OpenFile(PARQUET_FILENAME, false, reader_properties); - - // Get the File MetaData - std::shared_ptr file_metadata = parquet_reader->metadata(); - - // Get the number of RowGroups - int num_row_groups = file_metadata->num_row_groups(); - assert(num_row_groups == 1); - - // Get the number of Columns - int num_columns = file_metadata->num_columns(); - assert(num_columns == 8); - - // Iterate over all the RowGroups in the file - for (int r = 0; r < num_row_groups; ++r) { - // Get the RowGroup Reader - std::shared_ptr row_group_reader = - parquet_reader->RowGroup(r); - - int64_t values_read = 0; - int64_t rows_read = 0; - int16_t definition_level; - int16_t repetition_level; - int i; - std::shared_ptr column_reader; - - // Get the Column Reader for the boolean column - column_reader = row_group_reader->Column(0); - parquet::BoolReader* bool_reader = - static_cast(column_reader.get()); - - // Read all the rows in the column - i = 0; - while (bool_reader->HasNext()) { - bool value; - // Read one value at a time. The number of rows read is returned. values_read - // contains the number of non-null rows - rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // There are no NULL values in the rows written - assert(values_read == 1); - // Verify the value written - bool expected_value = ((i % 2) == 0) ? 
true : false; - assert(value == expected_value); - i++; - } + // Close the ParquetFileWriter + file_writer->Close(); - // Get the Column Reader for the Int32 column - column_reader = row_group_reader->Column(1); - parquet::Int32Reader* int32_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (int32_reader->HasNext()) { - int32_t value; - // Read one value at a time. The number of rows read is returned. values_read - // contains the number of non-null rows - rows_read = int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // There are no NULL values in the rows written - assert(values_read == 1); - // Verify the value written - assert(value == i); - i++; - } + // Write the bytes to file + DCHECK(out_file->Close().ok()); + } catch (const std::exception& e) { + std::cerr << "Parquet write error: " << e.what() << std::endl; + return -1; + } - // Get the Column Reader for the Int64 column - column_reader = row_group_reader->Column(2); - parquet::Int64Reader* int64_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (int64_reader->HasNext()) { - int64_t value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = int64_reader->ReadBatch(1, &definition_level, &repetition_level, - &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // There are no NULL values in the rows written - assert(values_read == 1); - // Verify the value written - int64_t expected_value = i * 1000 * 1000; - expected_value *= 1000 * 1000; - assert(value == expected_value); - if ((i % 2) == 0) { - assert(repetition_level == 1); - } else { - assert(repetition_level == 0); - } - i++; - } + /********************************************************************************** + PARQUET READER EXAMPLE + **********************************************************************************/ + + // Decryption configuration: Decrypt using key retriever callback that holds the keys + // of two encrypted columns and the footer key. + std::shared_ptr string_kr1 = + std::make_shared(); + string_kr1->PutKey("kf", kFooterEncryptionKey); + string_kr1->PutKey("kc1", kColumnEncryptionKey1); + string_kr1->PutKey("kc2", kColumnEncryptionKey2); + std::shared_ptr kr1 = + std::static_pointer_cast(string_kr1); + + parquet::FileDecryptionProperties::Builder file_decryption_builder; + + + try { + parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); + + // Add the current decryption configuration to ReaderProperties. 
+ reader_properties.file_decryption_properties( + file_decryption_builder.key_retriever(kr1)->build()); + + // Create a ParquetReader instance + std::unique_ptr parquet_reader = + parquet::ParquetFileReader::OpenFile(PARQUET_FILENAME, false, reader_properties); + + // Get the File MetaData + std::shared_ptr file_metadata = parquet_reader->metadata(); + + // Get the number of RowGroups + int num_row_groups = file_metadata->num_row_groups(); + assert(num_row_groups == 1); + + // Get the number of Columns + int num_columns = file_metadata->num_columns(); + assert(num_columns == 8); + + // Iterate over all the RowGroups in the file + for (int r = 0; r < num_row_groups; ++r) { + // Get the RowGroup Reader + std::shared_ptr row_group_reader = + parquet_reader->RowGroup(r); + + int64_t values_read = 0; + int64_t rows_read = 0; + int16_t definition_level; + int16_t repetition_level; + int i; + std::shared_ptr column_reader; + + // Get the Column Reader for the boolean column + column_reader = row_group_reader->Column(0); + parquet::BoolReader* bool_reader = + static_cast(column_reader.get()); + + // Read all the rows in the column + i = 0; + while (bool_reader->HasNext()) { + bool value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + bool expected_value = ((i % 2) == 0) ? true : false; + assert(value == expected_value); + i++; + } - // Get the Column Reader for the Int96 column - column_reader = row_group_reader->Column(3); - parquet::Int96Reader* int96_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (int96_reader->HasNext()) { - parquet::Int96 value; - // Read one value at a time. 
The number of rows read is returned. values_read - // contains the number of non-null rows - rows_read = int96_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // There are no NULL values in the rows written - assert(values_read == 1); - // Verify the value written - parquet::Int96 expected_value; - expected_value.value[0] = i; - expected_value.value[1] = i + 1; - expected_value.value[2] = i + 2; - for (int j = 0; j < 3; j++) { - assert(value.value[j] == expected_value.value[j]); - } - i++; - } + // Get the Column Reader for the Int32 column + column_reader = row_group_reader->Column(1); + parquet::Int32Reader* int32_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int32_reader->HasNext()) { + int32_t value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + assert(value == i); + i++; + } - // Get the Column Reader for the Float column - column_reader = row_group_reader->Column(4); - parquet::FloatReader* float_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (float_reader->HasNext()) { - float value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // There are no NULL values in the rows written - assert(values_read == 1); - // Verify the value written - float expected_value = static_cast(i) * 1.1f; - assert(value == expected_value); - i++; + // Get the Column Reader for the Int64 column + column_reader = row_group_reader->Column(2); + parquet::Int64Reader* int64_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int64_reader->HasNext()) { + int64_t value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = int64_reader->ReadBatch(1, &definition_level, &repetition_level, + &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + int64_t expected_value = i * 1000 * 1000; + expected_value *= 1000 * 1000; + assert(value == expected_value); + if ((i % 2) == 0) { + assert(repetition_level == 1); + } else { + assert(repetition_level == 0); } + i++; + } - // Get the Column Reader for the Double column - column_reader = row_group_reader->Column(5); - parquet::DoubleReader* double_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (double_reader->HasNext()) { - double value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // There are no NULL values in the rows written - assert(values_read == 1); - // Verify the value written - double expected_value = i * 1.1111111; - assert(value == expected_value); - i++; + // Get the Column Reader for the Int96 column + column_reader = row_group_reader->Column(3); + parquet::Int96Reader* int96_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int96_reader->HasNext()) { + parquet::Int96 value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = int96_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + parquet::Int96 expected_value; + expected_value.value[0] = i; + expected_value.value[1] = i + 1; + expected_value.value[2] = i + 2; + for (int j = 0; j < 3; j++) { + assert(value.value[j] == expected_value.value[j]); } + i++; + } - // Get the Column Reader for the ByteArray column - column_reader = row_group_reader->Column(6); - parquet::ByteArrayReader* ba_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (ba_reader->HasNext()) { - parquet::ByteArray value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = - ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // Verify the value written - char expected_value[FIXED_LENGTH] = "parquet"; - expected_value[7] = static_cast('0' + i / 100); - expected_value[8] = static_cast('0' + (i / 10) % 10); - expected_value[9] = static_cast('0' + i % 10); - if (i % 2 == 0) { // only alternate values exist - // There are no NULL values in the rows written - assert(values_read == 1); - assert(value.len == FIXED_LENGTH); - assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0); - assert(definition_level == 1); - } else { - // There are NULL values in the rows written - assert(values_read == 0); - assert(definition_level == 0); - } - i++; - } + // Get the Column Reader for the Float column + column_reader = row_group_reader->Column(4); + parquet::FloatReader* float_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (float_reader->HasNext()) { + float value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + float expected_value = static_cast(i) * 1.1f; + assert(value == expected_value); + i++; + } + + // Get the Column Reader for the Double column + column_reader = row_group_reader->Column(5); + parquet::DoubleReader* double_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (double_reader->HasNext()) { + double value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + double expected_value = i * 1.1111111; + assert(value == expected_value); + i++; + } - // Get the Column Reader for the FixedLengthByteArray column - column_reader = row_group_reader->Column(7); - parquet::FixedLenByteArrayReader* flba_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (flba_reader->HasNext()) { - parquet::FixedLenByteArray value; - // Read one value at a time. The number of rows read is returned. values_read - // contains the number of non-null rows - rows_read = flba_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); + // Get the Column Reader for the ByteArray column + column_reader = row_group_reader->Column(6); + parquet::ByteArrayReader* ba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (ba_reader->HasNext()) { + parquet::ByteArray value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = + ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // Verify the value written + char expected_value[FIXED_LENGTH] = "parquet"; + expected_value[7] = static_cast('0' + i / 100); + expected_value[8] = static_cast('0' + (i / 10) % 10); + expected_value[9] = static_cast('0' + i % 10); + if (i % 2 == 0) { // only alternate values exist // There are no NULL values in the rows written assert(values_read == 1); - // Verify the value written - char v = static_cast(i); - char expected_value[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v}; + assert(value.len == FIXED_LENGTH); assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0); - i++; + assert(definition_level == 1); + } else { + // There are NULL values in the rows written + assert(values_read == 0); + assert(definition_level == 0); } + i++; } - } catch (const parquet::HiddenColumnException& e) { - std::cerr << "Parquet read error: hidden column: " << e.what() << std::endl; - } catch (const std::exception& e) { - std::cerr << "Parquet read error: " << e.what() << std::endl; - } - std::cout << "Example [" << (example_id+1) << "] Parquet Writing and Reading Complete" << std::endl; + // Get the Column Reader for the FixedLengthByteArray column + column_reader = row_group_reader->Column(7); + parquet::FixedLenByteArrayReader* flba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (flba_reader->HasNext()) { + parquet::FixedLenByteArray value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = flba_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + char v = static_cast(i); + char expected_value[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v}; + assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0); + i++; + } + } + } catch (const std::exception& e) { + std::cerr << "Parquet read error: " << e.what() << std::endl; } + + std::cout << "Parquet Writing and Reading Complete" << std::endl; return 0; } diff --git a/cpp/src/parquet/encryption.cc b/cpp/src/parquet/encryption.cc index 9666e9ae2bb4..8a88db002240 100644 --- a/cpp/src/parquet/encryption.cc +++ b/cpp/src/parquet/encryption.cc @@ -183,6 +183,8 @@ FileEncryptionProperties::FileEncryptionProperties( : footer_key_(footer_key), footer_key_metadata_(footer_key_metadata), encrypted_footer_(encrypted_footer), + aad_prefix_(aad_prefix), + store_aad_prefix_in_file_(store_aad_prefix_in_file), column_properties_(column_properties) { // file encryption properties object can be used for writing only one file. 
// Upon completion of file writing, the encryption keys in the properties will be wiped @@ -194,11 +196,11 @@ FileEncryptionProperties::FileEncryptionProperties( DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || footer_key.length() == 32); - uint8_t aad_file_unique[AAD_FILE_UNIQUE_LENGTH]; - memset(aad_file_unique, 0, AAD_FILE_UNIQUE_LENGTH); - RAND_bytes(aad_file_unique, sizeof(AAD_FILE_UNIQUE_LENGTH)); + uint8_t aad_file_unique[kAadFileUniqueLength]; + memset(aad_file_unique, 0, kAadFileUniqueLength); + RAND_bytes(aad_file_unique, sizeof(kAadFileUniqueLength)); std::string aad_file_unique_str(reinterpret_cast(aad_file_unique), - AAD_FILE_UNIQUE_LENGTH); + kAadFileUniqueLength); bool supply_aad_prefix = false; if (aad_prefix.empty()) { diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 3d4ec2163b0d..7d943d15be51 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -32,13 +32,13 @@ namespace parquet { -static constexpr ParquetCipher::type DEFAULT_ENCRYPTION_ALGORITHM = +static constexpr ParquetCipher::type kDefaultEncryptionAlgorithm = ParquetCipher::AES_GCM_V1; -static constexpr int32_t MAXIMAL_AAD_METADATA_LENGTH = 256; -static constexpr bool DEFAULT_ENCRYPTED_FOOTER = true; -static constexpr bool DEFAULT_CHECK_SIGNATURE = true; -static constexpr bool DEFAULT_ALLOW_PLAINTEXT_FILES = false; -static constexpr int32_t AAD_FILE_UNIQUE_LENGTH = 8; +static constexpr int32_t kMaximalAadMetadataLength = 256; +static constexpr bool kDefaultEncryptedFooter = true; +static constexpr bool kDefaultCheckSignature = true; +static constexpr bool kDefaultAllowPlaintextFiles = false; +static constexpr int32_t kAadFileUniqueLength = 8; class PARQUET_EXPORT DecryptionKeyRetriever { public: @@ -46,7 +46,7 @@ class PARQUET_EXPORT DecryptionKeyRetriever { virtual ~DecryptionKeyRetriever() {} }; -// Simple integer key retriever +/// Simple integer key retriever class PARQUET_EXPORT IntegerKeyIdRetriever : public 
DecryptionKeyRetriever { public: void PutKey(uint32_t key_id, const std::string& key); @@ -88,22 +88,22 @@ class PARQUET_EXPORT ColumnEncryptionProperties { public: class Builder { public: - // Convenience builder for regular (not nested) columns. + /// Convenience builder for regular (not nested) columns. explicit Builder(const std::string& name) { Builder(schema::ColumnPath::FromDotString(name), true); } - // Convenience builder for encrypted columns. + /// Convenience builder for encrypted columns. explicit Builder(const std::shared_ptr& path) : Builder(path, true) {} - // Set a column-specific key. - // If key is not set on an encrypted column, the column will - // be encrypted with the footer key. - // keyBytes Key length must be either 16, 24 or 32 bytes. - // The key is cloned, and will be wiped out (array values set to 0) upon completion of - // file writing. - // Caller is responsible for wiping out the input key array. + /// Set a column-specific key. + /// If key is not set on an encrypted column, the column will + /// be encrypted with the footer key. + /// keyBytes Key length must be either 16, 24 or 32 bytes. + /// The key is cloned, and will be wiped out (array values set to 0) upon completion + /// of file writing. + /// Caller is responsible for wiping out the input key array. Builder* key(std::string column_key) { if (column_key.empty()) return this; @@ -112,8 +112,8 @@ class PARQUET_EXPORT ColumnEncryptionProperties { return this; } - // Set a key retrieval metadata. - // use either key_metadata() or key_id(), not both + /// Set a key retrieval metadata. + /// use either key_metadata() or key_id(), not both Builder* key_metadata(const std::string& key_metadata) { DCHECK(!key_metadata.empty()); DCHECK(key_metadata_.empty()); @@ -121,9 +121,9 @@ class PARQUET_EXPORT ColumnEncryptionProperties { return this; } - // Set a key retrieval metadata (converted from String). 
- // use either key_metadata() or key_id(), not both - // key_id will be converted to metadata (UTF-8 array). + /// Set a key retrieval metadata (converted from String). + /// use either key_metadata() or key_id(), not both + /// key_id will be converted to metadata (UTF-8 array). Builder* key_id(const std::string& key_id); std::shared_ptr build() { @@ -160,6 +160,12 @@ class PARQUET_EXPORT ColumnEncryptionProperties { void set_utilized() { utilized_ = true; } + std::shared_ptr DeepClone() { + std::string key_copy = key_; + return std::shared_ptr(new ColumnEncryptionProperties( + encrypted_, column_path_, key_copy, key_metadata_)); + } + ColumnEncryptionProperties() = default; ColumnEncryptionProperties(const ColumnEncryptionProperties& other) = default; ColumnEncryptionProperties(ColumnEncryptionProperties&& other) = default; @@ -180,17 +186,17 @@ class PARQUET_EXPORT ColumnDecryptionProperties { public: class Builder { public: - // convenience builder for regular (not nested) columns. + /// convenience builder for regular (not nested) columns. explicit Builder(const std::string& name) : Builder(schema::ColumnPath::FromDotString(name)) {} explicit Builder(const std::shared_ptr& path) : column_path_(path) {} - // Set an explicit column key. If applied on a file that contains - // key metadata for this column the metadata will be ignored, - // the column will be decrypted with this key. - // key length must be either 16, 24 or 32 bytes. + /// Set an explicit column key. If applied on a file that contains + /// key metadata for this column the metadata will be ignored, + /// the column will be decrypted with this key. + /// key length must be either 16, 24 or 32 bytes. 
Builder* key(const std::string& key) { if (key.empty()) return this; @@ -225,25 +231,31 @@ class PARQUET_EXPORT ColumnDecryptionProperties { } } + std::shared_ptr DeepClone() { + std::string key_copy = key_; + return std::shared_ptr( + new ColumnDecryptionProperties(column_path_, key_copy)); + } + private: const std::shared_ptr column_path_; std::string key_; bool utilized_; - // This class is only required for setting explicit column decryption keys - - // to override key retriever (or to provide keys when key metadata and/or - // key retriever are not available) + /// This class is only required for setting explicit column decryption keys - + /// to override key retriever (or to provide keys when key metadata and/or + /// key retriever are not available) explicit ColumnDecryptionProperties( const std::shared_ptr& column_path, const std::string& key); }; class PARQUET_EXPORT AADPrefixVerifier { public: - // Verifies identity (AAD Prefix) of individual file, - // or of file collection in a data set. - // Throws exception if an AAD prefix is wrong. - // In a data set, AAD Prefixes should be collected, - // and then checked for missing files. + /// Verifies identity (AAD Prefix) of individual file, + /// or of file collection in a data set. + /// Throws exception if an AAD prefix is wrong. + /// In a data set, AAD Prefixes should be collected, + /// and then checked for missing files. virtual void check(const std::string& aad_prefix) = 0; virtual ~AADPrefixVerifier() {} }; @@ -253,22 +265,22 @@ class PARQUET_EXPORT FileDecryptionProperties { class Builder { public: Builder() { - check_plaintext_footer_integrity_ = DEFAULT_CHECK_SIGNATURE; - plaintext_files_allowed_ = DEFAULT_ALLOW_PLAINTEXT_FILES; + check_plaintext_footer_integrity_ = kDefaultCheckSignature; + plaintext_files_allowed_ = kDefaultAllowPlaintextFiles; } - // Set an explicit footer key. 
If applied on a file that contains - // footer key metadata the metadata will be ignored, the footer - // will be decrypted/verified with this key. - // If explicit key is not set, footer key will be fetched from - // key retriever. - // With explicit keys or AAD prefix, new encryption properties object must be created - // for each encrypted file. - // Explicit encryption keys (footer and column) are cloned. - // Upon completion of file reading, the cloned encryption keys in the properties will - // be wiped out (array values set to 0). - // Caller is responsible for wiping out the input key array. - // param footerKey Key length must be either 16, 24 or 32 bytes. + /// Set an explicit footer key. If applied on a file that contains + /// footer key metadata the metadata will be ignored, the footer + /// will be decrypted/verified with this key. + /// If explicit key is not set, footer key will be fetched from + /// key retriever. + /// With explicit keys or AAD prefix, new encryption properties object must be + /// created for each encrypted file. + /// Explicit encryption keys (footer and column) are cloned. + /// Upon completion of file reading, the cloned encryption keys in the properties + /// will be wiped out (array values set to 0). + /// Caller is responsible for wiping out the input key array. + /// param footerKey Key length must be either 16, 24 or 32 bytes. Builder* footer_key(const std::string footer_key) { if (footer_key.empty()) { return this; @@ -278,12 +290,12 @@ class PARQUET_EXPORT FileDecryptionProperties { return this; } - // Set explicit column keys (decryption properties). - // Its also possible to set a key retriever on this property object. - // Upon file decryption, availability of explicit keys is checked before - // invocation of the retriever callback. - // If an explicit key is available for a footer or a column, - // its key metadata will be ignored. + /// Set explicit column keys (decryption properties). 
+ /// Its also possible to set a key retriever on this property object. + /// Upon file decryption, availability of explicit keys is checked before + /// invocation of the retriever callback. + /// If an explicit key is available for a footer or a column, + /// its key metadata will be ignored. Builder* column_properties( const std::map, std::shared_ptr, @@ -306,12 +318,12 @@ class PARQUET_EXPORT FileDecryptionProperties { return this; } - // Set a key retriever callback. Its also possible to - // set explicit footer or column keys on this file property object. - // Upon file decryption, availability of explicit keys is checked before - // invocation of the retriever callback. - // If an explicit key is available for a footer or a column, - // its key metadata will be ignored. + /// Set a key retriever callback. Its also possible to + /// set explicit footer or column keys on this file property object. + /// Upon file decryption, availability of explicit keys is checked before + /// invocation of the retriever callback. + /// If an explicit key is available for a footer or a column, + /// its key metadata will be ignored. Builder* key_retriever(const std::shared_ptr& key_retriever) { if (key_retriever == NULLPTR) return this; @@ -320,21 +332,21 @@ class PARQUET_EXPORT FileDecryptionProperties { return this; } - // Skip integrity verification of plaintext footers. - // If not called, integrity of plaintext footers will be checked in runtime, - // and an exception will be thrown in the following situations: - // - footer signing key is not available - // (not passed, or not found by key retriever) - // - footer content and signature don't match + /// Skip integrity verification of plaintext footers. 
+ /// If not called, integrity of plaintext footers will be checked in runtime, + /// and an exception will be thrown in the following situations: + /// - footer signing key is not available + /// (not passed, or not found by key retriever) + /// - footer content and signature don't match Builder* disable_footer_signature_verification() { check_plaintext_footer_integrity_ = false; return this; } - // Explicitly supply the file AAD prefix. - // A must when a prefix is used for file encryption, but not stored in file. - // If AAD prefix is stored in file, it will be compared to the explicitly - // supplied value and an exception will be thrown if they differ. + /// Explicitly supply the file AAD prefix. + /// A must when a prefix is used for file encryption, but not stored in file. + /// If AAD prefix is stored in file, it will be compared to the explicitly + /// supplied value and an exception will be thrown if they differ. Builder* aad_prefix(const std::string& aad_prefix) { if (aad_prefix.empty()) { return this; @@ -344,7 +356,7 @@ class PARQUET_EXPORT FileDecryptionProperties { return this; } - // Set callback for verification of AAD Prefixes stored in file. + /// Set callback for verification of AAD Prefixes stored in file. Builder* aad_prefix_verifier(std::shared_ptr aad_prefix_verifier) { if (aad_prefix_verifier == NULLPTR) return this; @@ -353,12 +365,12 @@ class PARQUET_EXPORT FileDecryptionProperties { return this; } - // By default, reading plaintext (unencrypted) files is not - // allowed when using a decryptor - // - in order to detect files that were not encrypted by mistake. - // However, the default behavior can be overriden by calling this method. - // The caller should use then a different method to ensure encryption - // of files with sensitive data. + /// By default, reading plaintext (unencrypted) files is not + /// allowed when using a decryptor + /// - in order to detect files that were not encrypted by mistake. 
+ /// However, the default behavior can be overriden by calling this method. + /// The caller should use then a different method to ensure encryption + /// of files with sensitive data. Builder* plaintext_files_allowed() { plaintext_files_allowed_ = true; return this; @@ -420,6 +432,36 @@ class PARQUET_EXPORT FileDecryptionProperties { void set_utilized() { utilized_ = true; } + /// FileDecryptionProperties object can be used for reading one file only. + /// (unless this object keeps the keyRetrieval callback only, and no explicit + /// keys or aadPrefix). + /// At the end, keys are wiped out in the memory. + /// This method allows to clone identical properties for another file, + /// with an option to update the aadPrefix (if newAadPrefix is null, + /// aadPrefix will be cloned too) + std::shared_ptr DeepClone(std::string new_aad_prefix = "") { + std::string footer_key_copy = footer_key_; + std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath> + column_properties_map_copy; + + for (std::pair, + std::shared_ptr> + element : column_properties_) { + column_properties_map_copy.insert( + std::pair, + std::shared_ptr>( + element.second->column_path(), element.second->DeepClone())); + } + + if (new_aad_prefix.empty()) new_aad_prefix = aad_prefix_; + return std::shared_ptr(new FileDecryptionProperties( + footer_key_copy, key_retriever_, check_plaintext_footer_integrity_, + new_aad_prefix, aad_prefix_verifier_, column_properties_map_copy, + plaintext_files_allowed_)); + } + private: std::string footer_key_; std::string aad_prefix_; @@ -452,32 +494,32 @@ class PARQUET_EXPORT FileEncryptionProperties { class Builder { public: explicit Builder(const std::string& footer_key) - : parquet_cipher_(DEFAULT_ENCRYPTION_ALGORITHM), - encrypted_footer_(DEFAULT_ENCRYPTED_FOOTER) { + : parquet_cipher_(kDefaultEncryptionAlgorithm), + encrypted_footer_(kDefaultEncryptedFooter) { footer_key_ = footer_key; store_aad_prefix_in_file_ = false; } - // Create files with plaintext 
footer. - // If not called, the files will be created with encrypted footer (default). + /// Create files with plaintext footer. + /// If not called, the files will be created with encrypted footer (default). Builder* set_plaintext_footer() { encrypted_footer_ = false; return this; } - // Set encryption algorithm. - // If not called, files will be encrypted with AES_GCM_V1 (default). + /// Set encryption algorithm. + /// If not called, files will be encrypted with AES_GCM_V1 (default). Builder* algorithm(ParquetCipher::type parquet_cipher) { parquet_cipher_ = parquet_cipher; return this; } - // Set a key retrieval metadata (converted from String). - // use either footer_key_metadata or footer_key_id, not both. + /// Set a key retrieval metadata (converted from String). + /// use either footer_key_metadata or footer_key_id, not both. Builder* footer_key_id(const std::string& key_id); - // Set a key retrieval metadata. - // use either footer_key_metadata or footer_key_id, not both. + /// Set a key retrieval metadata. + /// use either footer_key_metadata or footer_key_id, not both. Builder* footer_key_metadata(const std::string& footer_key_metadata) { if (footer_key_metadata.empty()) return this; @@ -486,7 +528,7 @@ class PARQUET_EXPORT FileEncryptionProperties { return this; } - // Set the file AAD Prefix. + /// Set the file AAD Prefix. Builder* aad_prefix(const std::string& aad_prefix) { if (aad_prefix.empty()) return this; @@ -496,8 +538,8 @@ class PARQUET_EXPORT FileEncryptionProperties { return this; } - // Skip storing AAD Prefix in file. - // If not called, and if AAD Prefix is set, it will be stored. + /// Skip storing AAD Prefix in file. + /// If not called, and if AAD Prefix is set, it will be stored. Builder* disable_store_aad_prefix_storage() { DCHECK(!aad_prefix_.empty()); @@ -505,9 +547,9 @@ class PARQUET_EXPORT FileEncryptionProperties { return this; } - // Set the list of encrypted columns and their properties (keys etc). 
- // If not called, all columns will be encrypted with the footer key. - // If called, the file columns not in the list will be left unencrypted. + /// Set the list of encrypted columns and their properties (keys etc). + /// If not called, all columns will be encrypted with the footer key. + /// If called, the file columns not in the list will be left unencrypted. Builder* column_properties( const std::map, std::shared_ptr, @@ -521,7 +563,7 @@ class PARQUET_EXPORT FileEncryptionProperties { std::shared_ptr> element : column_properties) { if (element.second->is_utilized()) { - ParquetException("Column properties utilized in another file"); + throw ParquetException("Column properties utilized in another file"); } element.second->set_utilized(); } @@ -574,13 +616,42 @@ class PARQUET_EXPORT FileEncryptionProperties { } } + /// FileEncryptionProperties object can be used for writing one file only. + /// (at the end, keys are wiped out in the memory). + /// This method allows to clone identical properties for another file, + /// with an option to update the aadPrefix (if newAadPrefix is null, + /// aadPrefix will be cloned too) + std::shared_ptr DeepClone(std::string new_aad_prefix = "") { + std::string footer_key_copy = footer_key_; + std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath> + column_properties_map_copy; + + for (std::pair, + std::shared_ptr> + element : column_properties_) { + column_properties_map_copy.insert( + std::pair, + std::shared_ptr>( + element.second->column_path(), element.second->DeepClone())); + } + + if (new_aad_prefix.empty()) new_aad_prefix = aad_prefix_; + return std::shared_ptr(new FileEncryptionProperties( + algorithm_.algorithm, footer_key_copy, footer_key_metadata_, encrypted_footer_, + new_aad_prefix, store_aad_prefix_in_file_, column_properties_map_copy)); + } + private: EncryptionAlgorithm algorithm_; std::string footer_key_; std::string footer_key_metadata_; bool encrypted_footer_; std::string file_aad_; + std::string 
aad_prefix_; bool utilized_; + bool store_aad_prefix_in_file_; std::map, std::shared_ptr, schema::ColumnPath::CmpColumnPath> diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index c079f2279b63..3373a826c907 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -237,7 +237,6 @@ class SerializedFile : public ParquetFileReader::Contents { } // no encryption or encryption with plaintext footer - // TODO: encryption with plaintext footer if (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) == 0) { uint32_t metadata_len = arrow::util::SafeLoadAs( reinterpret_cast(footer_buffer->data()) + footer_read_size - @@ -280,22 +279,35 @@ class SerializedFile : public ParquetFileReader::Contents { std::string aad_prefix = file_decryption_properties->aad_prefix(); EncryptionAlgorithm algo = file_metadata_->encryption_algorithm(); + if (algo.aad.supply_aad_prefix && aad_prefix.empty()) { + throw ParquetException( + "AAD prefix used for file encryption, but not stored in file" + "and not supplied in decryption properties"); + } + if (!algo.aad.aad_prefix.empty()) { if (!aad_prefix.empty()) { if (aad_prefix.compare(algo.aad.aad_prefix) != 0) { throw ParquetException( - "ADD Prefix in file and in properties is not the same"); + "AAD Prefix in file and in properties is not the same"); } } aad_prefix = algo.aad.aad_prefix; std::shared_ptr aad_prefix_verifier = file_decryption_properties->aad_prefix_verifier(); if (aad_prefix_verifier != NULLPTR) aad_prefix_verifier->check(aad_prefix); - } - if (algo.aad.supply_aad_prefix && aad_prefix.empty()) { - throw ParquetException( - "AAD prefix used for file encryption, but not stored in file" - "and not supplied in decryption properties"); + } else { + if (!algo.aad.supply_aad_prefix && !aad_prefix.empty()) { + throw ParquetException( + "AAD Prefix set in decryption properties, but not found in file"); + } + std::shared_ptr aad_prefix_verifier = + 
file_decryption_properties->aad_prefix_verifier(); + if (aad_prefix_verifier != NULLPTR) { + throw ParquetException( + "AAD prefix used for file encryption, but not stored in file and not " + "supplied in decryption properties"); + } } std::string file_aad = aad_prefix + algo.aad.aad_file_unique; @@ -355,12 +367,18 @@ class SerializedFile : public ParquetFileReader::Contents { EncryptionAlgorithm algo = file_crypto_metadata->encryption_algorithm(); std::string aad_prefix = file_decryption_properties->aad_prefix(); + if (algo.aad.supply_aad_prefix && aad_prefix.empty()) { + throw ParquetException( + "AAD prefix used for file encryption, " + "but not stored in file and not supplied " + "in decryption properties"); + } if (!algo.aad.aad_prefix.empty()) { if (!aad_prefix.empty()) { if (aad_prefix.compare(algo.aad.aad_prefix) != 0) { throw ParquetException( - "ADD Prefix in file and in properties " + "AAD Prefix in file and in properties " "is not the same"); } } @@ -368,12 +386,18 @@ class SerializedFile : public ParquetFileReader::Contents { std::shared_ptr aad_prefix_verifier = file_decryption_properties->aad_prefix_verifier(); if (aad_prefix_verifier != NULLPTR) aad_prefix_verifier->check(aad_prefix); - } - if (algo.aad.supply_aad_prefix && aad_prefix.empty()) { - throw ParquetException( - "AAD prefix used for file encryption, " - "but not stored in file and not supplied " - "in decryption properties"); + } else { + if (!algo.aad.supply_aad_prefix && !aad_prefix.empty()) { + throw ParquetException( + "AAD Prefix set in decryption properties, but not found in file"); + } + std::shared_ptr aad_prefix_verifier = + file_decryption_properties->aad_prefix_verifier(); + if (aad_prefix_verifier != NULLPTR) { + throw ParquetException( + "AAD prefix used for file encryption, but not stored in file and not " + "supplied in decryption properties"); + } } std::string file_aad = aad_prefix + algo.aad.aad_file_unique; file_decryptor_.reset( From 
7017089a7faba24a5c766f76c4c13933e96acade Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Mon, 3 Jun 2019 21:34:13 +0700 Subject: [PATCH 100/201] fix lint and format issue --- cpp/src/parquet/encryption-metadata-test.cc | 6 +++--- cpp/src/parquet/encryption-test.cc | 14 +++++++------- cpp/src/parquet/file_reader.cc | 2 +- cpp/src/parquet/file_writer.cc | 11 ++++++----- cpp/src/parquet/metadata.cc | 10 ++++++---- 5 files changed, 23 insertions(+), 20 deletions(-) diff --git a/cpp/src/parquet/encryption-metadata-test.cc b/cpp/src/parquet/encryption-metadata-test.cc index f81493dbed05..6f29b364b5b4 100644 --- a/cpp/src/parquet/encryption-metadata-test.cc +++ b/cpp/src/parquet/encryption-metadata-test.cc @@ -26,9 +26,9 @@ namespace parquet { namespace metadata { -const std::string kFooterEncryptionKey = "0123456789012345"; // 128bit/16 -const std::string kColumnEncryptionKey1 = "1234567890123450"; -const std::string kColumnEncryptionKey2 = "1234567890123451"; +const char kFooterEncryptionKey[] = "0123456789012345"; // 128bit/16 +const char kColumnEncryptionKey1[] = "1234567890123450"; +const char kColumnEncryptionKey2[] = "1234567890123451"; TEST(Metadata, EncryptFooter) { parquet::schema::NodeVector fields; diff --git a/cpp/src/parquet/encryption-test.cc b/cpp/src/parquet/encryption-test.cc index e4d4a27ed7f4..088a2c08bf88 100644 --- a/cpp/src/parquet/encryption-test.cc +++ b/cpp/src/parquet/encryption-test.cc @@ -27,10 +27,10 @@ using schema::ColumnPath; namespace test { -const std::string kFooterEncryptionKey = "0123456789012345"; // 128bit/16 -const std::string kColumnEncryptionKey1 = "1234567890123450"; -const std::string kColumnEncryptionKey2 = "1234567890123451"; -const std::string kFileName = "tester"; +const char kFooterEncryptionKey[] = "0123456789012345"; // 128bit/16 +const char kColumnEncryptionKey1[] = "1234567890123450"; +const char kColumnEncryptionKey2[] = "1234567890123451"; +const char kFileName[] = "tester"; TEST(TestColumnEncryptionProperties, 
ColumnEncryptedWithOwnKey) { std::shared_ptr column_path_1 = @@ -66,7 +66,7 @@ TEST(TestEncryptionProperties, UniformEncryption) { std::shared_ptr props = builder.build(); ASSERT_EQ(true, props->encrypted_footer()); - ASSERT_EQ(DEFAULT_ENCRYPTION_ALGORITHM, props->algorithm().algorithm); + ASSERT_EQ(kDefaultEncryptionAlgorithm, props->algorithm().algorithm); ASSERT_EQ(kFooterEncryptionKey, props->footer_key()); ASSERT_EQ("kf", props->footer_key_metadata()); @@ -106,7 +106,7 @@ TEST(TestEncryptionProperties, EncryptFooterAndTwoColumns) { std::shared_ptr props = builder.build(); ASSERT_EQ(true, props->encrypted_footer()); - ASSERT_EQ(DEFAULT_ENCRYPTION_ALGORITHM, props->algorithm().algorithm); + ASSERT_EQ(kDefaultEncryptionAlgorithm, props->algorithm().algorithm); ASSERT_EQ(kFooterEncryptionKey, props->footer_key()); std::shared_ptr out_col_props_1 = @@ -163,7 +163,7 @@ TEST(TestEncryptionProperties, EncryptTwoColumnsNotFooter) { std::shared_ptr props = builder.build(); ASSERT_EQ(false, props->encrypted_footer()); - ASSERT_EQ(DEFAULT_ENCRYPTION_ALGORITHM, props->algorithm().algorithm); + ASSERT_EQ(kDefaultEncryptionAlgorithm, props->algorithm().algorithm); ASSERT_EQ(kFooterEncryptionKey, props->footer_key()); std::shared_ptr out_col_props_1 = diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 3373a826c907..fe2563dffedb 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -279,7 +279,7 @@ class SerializedFile : public ParquetFileReader::Contents { std::string aad_prefix = file_decryption_properties->aad_prefix(); EncryptionAlgorithm algo = file_metadata_->encryption_algorithm(); - if (algo.aad.supply_aad_prefix && aad_prefix.empty()) { + if (algo.aad.supply_aad_prefix && aad_prefix.empty()) { throw ParquetException( "AAD prefix used for file encryption, but not stored in file" "and not supplied in decryption properties"); diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 
b6477c0d844a..a5cd76e5d749 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -306,8 +306,9 @@ class FileSerializer : public ParquetFileWriter::Contents { PARQUET_THROW_NOT_OK(sink_->Tell(&position)); uint32_t footer_and_crypto_len = static_cast(position - metadata_start); - sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4); - sink_->Write(kParquetEMagic, 4); + PARQUET_THROW_NOT_OK( + sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4)); + PARQUET_THROW_NOT_OK(sink_->Write(kParquetEMagic, 4)); } else { // Encrypted file with plaintext footer file_metadata_ = metadata_->Finish(); auto footer_signing_encryptor = file_encryptor_->GetFooterSigningEncryptor(); @@ -319,7 +320,7 @@ class FileSerializer : public ParquetFileWriter::Contents { } } - sink_->Close(); + PARQUET_THROW_NOT_OK(sink_->Close()); } } @@ -460,8 +461,8 @@ void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sin PARQUET_THROW_NOT_OK(sink->Tell(&position)); metadata_len = static_cast(position) - metadata_len; - sink->Write(reinterpret_cast(&metadata_len), 4); - sink->Write(kParquetMagic, 4); + PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast(&metadata_len), 4)); + PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4)); } } } diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 71d9dc9434c1..394950ef4be6 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -522,11 +522,13 @@ class FileMetaData::FileMetaDataImpl { encryptor->Encrypt(serialized_data, serialized_len, encrypted_data.data()); // write unencrypted footer - dst->Write(serialized_data, serialized_len); + PARQUET_THROW_NOT_OK(dst->Write(serialized_data, serialized_len)); // Write signature (nonce and tag) - dst->Write(encrypted_data.data() + 4, encryption::kNonceLength); - dst->Write(encrypted_data.data() + encrypted_len - encryption::kGcmTagLength, - encryption::kGcmTagLength); + PARQUET_THROW_NOT_OK( + 
dst->Write(encrypted_data.data() + 4, encryption::kNonceLength)); + PARQUET_THROW_NOT_OK( + dst->Write(encrypted_data.data() + encrypted_len - encryption::kGcmTagLength, + encryption::kGcmTagLength)); } else { // either plaintext file (when encryptor is null) // or encrypted file with encrypted footer serializer.Serialize(metadata_.get(), dst, encryptor, false); From a3d924c700f64e316b3cf4adc8ce20802e8d201c Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Tue, 4 Jun 2019 08:10:22 +0700 Subject: [PATCH 101/201] fix metadata set, statistics set issues --- cpp/src/parquet/encryption-metadata-test.cc | 422 +++++++++++++++++++- cpp/src/parquet/metadata.cc | 208 +++++----- cpp/src/parquet/metadata.h | 1 + 3 files changed, 534 insertions(+), 97 deletions(-) diff --git a/cpp/src/parquet/encryption-metadata-test.cc b/cpp/src/parquet/encryption-metadata-test.cc index 6f29b364b5b4..7ff31f9b2122 100644 --- a/cpp/src/parquet/encryption-metadata-test.cc +++ b/cpp/src/parquet/encryption-metadata-test.cc @@ -21,6 +21,7 @@ #include "parquet/properties.h" #include "parquet/schema.h" +#include "parquet/statistics.h" namespace parquet { @@ -28,9 +29,9 @@ namespace metadata { const char kFooterEncryptionKey[] = "0123456789012345"; // 128bit/16 const char kColumnEncryptionKey1[] = "1234567890123450"; -const char kColumnEncryptionKey2[] = "1234567890123451"; +// const char kColumnEncryptionKey2[] = "1234567890123451"; -TEST(Metadata, EncryptFooter) { +TEST(Metadata, UniformEncryption) { parquet::schema::NodeVector fields; parquet::schema::NodePtr root; parquet::SchemaDescriptor schema; @@ -40,19 +41,294 @@ TEST(Metadata, EncryptFooter) { root = parquet::schema::GroupNode::Make("schema", Repetition::REPEATED, fields); schema.Init(root); + int64_t nrows = 1000; + int32_t int_min = 100, int_max = 200; + EncodedStatistics stats_int; + stats_int.set_null_count(0) + .set_distinct_count(nrows) + .set_min(std::string(reinterpret_cast(&int_min), 4)) + 
.set_max(std::string(reinterpret_cast(&int_max), 4)); + EncodedStatistics stats_float; + float float_min = 100.100f, float_max = 200.200f; + stats_float.set_null_count(0) + .set_distinct_count(nrows) + .set_min(std::string(reinterpret_cast(&float_min), 4)) + .set_max(std::string(reinterpret_cast(&float_max), 4)); + + FileEncryptionProperties::Builder encryption_prop_builder(kFooterEncryptionKey); + encryption_prop_builder.footer_key_metadata("kf"); + + WriterProperties::Builder writer_prop_builder; + writer_prop_builder.version(ParquetVersion::PARQUET_2_0); + writer_prop_builder.encryption(encryption_prop_builder.build()); + auto props = writer_prop_builder.build(); + + auto f_builder = FileMetaDataBuilder::Make(&schema, props); + auto rg1_builder = f_builder->AppendRowGroup(); + + // Write the metadata + // rowgroup1 metadata + auto col1_builder = rg1_builder->NextColumnChunk(); + auto col2_builder = rg1_builder->NextColumnChunk(); + // column metadata + stats_int.set_is_signed(true); + col1_builder->SetStatistics(stats_int); + stats_float.set_is_signed(true); + col2_builder->SetStatistics(stats_float); + col1_builder->Finish(nrows / 2, 4, 0, 10, 512, 600, true, false); + col2_builder->Finish(nrows / 2, 24, 0, 30, 512, 600, true, false); + + rg1_builder->set_num_rows(nrows / 2); + rg1_builder->Finish(1024); + + // rowgroup2 metadata + auto rg2_builder = f_builder->AppendRowGroup(); + col1_builder = rg2_builder->NextColumnChunk(); + col2_builder = rg2_builder->NextColumnChunk(); + // column metadata + col1_builder->SetStatistics(stats_int); + col2_builder->SetStatistics(stats_float); + col1_builder->Finish(nrows / 2, 6, 0, 10, 512, 600, true, false); + col2_builder->Finish(nrows / 2, 16, 0, 26, 512, 600, true, false); + + rg2_builder->set_num_rows(nrows / 2); + rg2_builder->Finish(1024); + + // Read the metadata + auto f_accessor = f_builder->Finish(); + + ASSERT_EQ(false, f_accessor->is_encryption_algorithm_set()); + + auto file_crypto_metadata = 
f_builder->GetCryptoMetaData(); + ASSERT_EQ(true, file_crypto_metadata != NULLPTR); + + // file metadata + ASSERT_EQ(nrows, f_accessor->num_rows()); + ASSERT_LE(0, static_cast(f_accessor->size())); + ASSERT_EQ(2, f_accessor->num_row_groups()); + ASSERT_EQ(ParquetVersion::PARQUET_2_0, f_accessor->version()); + ASSERT_EQ(DEFAULT_CREATED_BY, f_accessor->created_by()); + ASSERT_EQ(3, f_accessor->num_schema_elements()); + + // row group1 metadata + auto rg1_accessor = f_accessor->RowGroup(0); + ASSERT_EQ(2, rg1_accessor->num_columns()); + ASSERT_EQ(nrows / 2, rg1_accessor->num_rows()); + ASSERT_EQ(1024, rg1_accessor->total_byte_size()); + + auto rg1_column1 = rg1_accessor->ColumnChunk(0); + auto rg1_column2 = rg1_accessor->ColumnChunk(1); + ASSERT_EQ(true, rg1_column1->is_stats_set()); + ASSERT_EQ(true, rg1_column2->is_stats_set()); + ASSERT_EQ(stats_float.min(), rg1_column2->statistics()->EncodeMin()); + ASSERT_EQ(stats_float.max(), rg1_column2->statistics()->EncodeMax()); + ASSERT_EQ(stats_int.min(), rg1_column1->statistics()->EncodeMin()); + ASSERT_EQ(stats_int.max(), rg1_column1->statistics()->EncodeMax()); + ASSERT_EQ(0, rg1_column1->statistics()->null_count()); + ASSERT_EQ(0, rg1_column2->statistics()->null_count()); + ASSERT_EQ(nrows, rg1_column1->statistics()->distinct_count()); + ASSERT_EQ(nrows, rg1_column2->statistics()->distinct_count()); + ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column1->compression()); + ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column2->compression()); + ASSERT_EQ(nrows / 2, rg1_column1->num_values()); + ASSERT_EQ(nrows / 2, rg1_column2->num_values()); + ASSERT_EQ(3, rg1_column1->encodings().size()); + ASSERT_EQ(3, rg1_column2->encodings().size()); + ASSERT_EQ(512, rg1_column1->total_compressed_size()); + ASSERT_EQ(512, rg1_column2->total_compressed_size()); + ASSERT_EQ(600, rg1_column1->total_uncompressed_size()); + ASSERT_EQ(600, rg1_column2->total_uncompressed_size()); + ASSERT_EQ(4, rg1_column1->dictionary_page_offset()); + 
ASSERT_EQ(24, rg1_column2->dictionary_page_offset()); + ASSERT_EQ(10, rg1_column1->data_page_offset()); + ASSERT_EQ(30, rg1_column2->data_page_offset()); + + auto rg2_accessor = f_accessor->RowGroup(1); + ASSERT_EQ(2, rg2_accessor->num_columns()); + ASSERT_EQ(nrows / 2, rg2_accessor->num_rows()); + ASSERT_EQ(1024, rg2_accessor->total_byte_size()); + + auto rg2_column1 = rg2_accessor->ColumnChunk(0); + auto rg2_column2 = rg2_accessor->ColumnChunk(1); + ASSERT_EQ(true, rg2_column1->is_stats_set()); + ASSERT_EQ(true, rg2_column2->is_stats_set()); + ASSERT_EQ(stats_float.min(), rg2_column2->statistics()->EncodeMin()); + ASSERT_EQ(stats_float.max(), rg2_column2->statistics()->EncodeMax()); + ASSERT_EQ(stats_int.min(), rg1_column1->statistics()->EncodeMin()); + ASSERT_EQ(stats_int.max(), rg1_column1->statistics()->EncodeMax()); + ASSERT_EQ(0, rg2_column1->statistics()->null_count()); + ASSERT_EQ(0, rg2_column2->statistics()->null_count()); + ASSERT_EQ(nrows, rg2_column1->statistics()->distinct_count()); + ASSERT_EQ(nrows, rg2_column2->statistics()->distinct_count()); + ASSERT_EQ(nrows / 2, rg2_column1->num_values()); + ASSERT_EQ(nrows / 2, rg2_column2->num_values()); + ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column1->compression()); + ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column2->compression()); + ASSERT_EQ(3, rg2_column1->encodings().size()); + ASSERT_EQ(3, rg2_column2->encodings().size()); + ASSERT_EQ(512, rg2_column1->total_compressed_size()); + ASSERT_EQ(512, rg2_column2->total_compressed_size()); + ASSERT_EQ(600, rg2_column1->total_uncompressed_size()); + ASSERT_EQ(600, rg2_column2->total_uncompressed_size()); + ASSERT_EQ(6, rg2_column1->dictionary_page_offset()); + ASSERT_EQ(16, rg2_column2->dictionary_page_offset()); + ASSERT_EQ(10, rg2_column1->data_page_offset()); + ASSERT_EQ(26, rg2_column2->data_page_offset()); +} + +TEST(Metadata, EncryptFooterAndOneColumn) { + parquet::schema::NodeVector fields; + parquet::schema::NodePtr root; + parquet::SchemaDescriptor 
schema; + + fields.push_back(parquet::schema::Int32("int_col", Repetition::REQUIRED)); + fields.push_back(parquet::schema::Float("float_col", Repetition::REQUIRED)); + root = parquet::schema::GroupNode::Make("schema", Repetition::REPEATED, fields); + schema.Init(root); + + int64_t nrows = 1000; + int32_t int_min = 100, int_max = 200; + EncodedStatistics stats_int; + stats_int.set_null_count(0) + .set_distinct_count(nrows) + .set_min(std::string(reinterpret_cast(&int_min), 4)) + .set_max(std::string(reinterpret_cast(&int_max), 4)); + EncodedStatistics stats_float; + float float_min = 100.100f, float_max = 200.200f; + stats_float.set_null_count(0) + .set_distinct_count(nrows) + .set_min(std::string(reinterpret_cast(&float_min), 4)) + .set_max(std::string(reinterpret_cast(&float_max), 4)); + + std::shared_ptr int_col_path = + parquet::schema::ColumnPath::FromDotString("int_col"); + ColumnEncryptionProperties::Builder int_col_builder(int_col_path); + int_col_builder.key(kColumnEncryptionKey1); + int_col_builder.key_id("kc1"); + + std::map, + std::shared_ptr, schema::ColumnPath::CmpColumnPath> + encryption_col_props; + encryption_col_props[int_col_path] = int_col_builder.build(); + FileEncryptionProperties::Builder encryption_prop_builder(kFooterEncryptionKey); encryption_prop_builder.footer_key_metadata("kf"); + encryption_prop_builder.column_properties(encryption_col_props); WriterProperties::Builder writer_prop_builder; + writer_prop_builder.version(ParquetVersion::PARQUET_2_0); writer_prop_builder.encryption(encryption_prop_builder.build()); auto props = writer_prop_builder.build(); auto f_builder = FileMetaDataBuilder::Make(&schema, props); - auto file_metadata = f_builder->Finish(); - ASSERT_EQ(false, file_metadata->is_encryption_algorithm_set()); + auto rg1_builder = f_builder->AppendRowGroup(); + + // Write the metadata + // rowgroup1 metadata + auto col1_builder = rg1_builder->NextColumnChunk(); + auto col2_builder = rg1_builder->NextColumnChunk(); + // column 
metadata + stats_int.set_is_signed(true); + col1_builder->SetStatistics(stats_int); + stats_float.set_is_signed(true); + col2_builder->SetStatistics(stats_float); + col1_builder->Finish(nrows / 2, 4, 0, 10, 512, 600, true, false); + col2_builder->Finish(nrows / 2, 24, 0, 30, 512, 600, true, false); + + rg1_builder->set_num_rows(nrows / 2); + rg1_builder->Finish(1024); + + // rowgroup2 metadata + auto rg2_builder = f_builder->AppendRowGroup(); + col1_builder = rg2_builder->NextColumnChunk(); + col2_builder = rg2_builder->NextColumnChunk(); + // column metadata + col1_builder->SetStatistics(stats_int); + col2_builder->SetStatistics(stats_float); + col1_builder->Finish(nrows / 2, 6, 0, 10, 512, 600, true, false); + col2_builder->Finish(nrows / 2, 16, 0, 26, 512, 600, true, false); + + rg2_builder->set_num_rows(nrows / 2); + rg2_builder->Finish(1024); + + // Read the metadata + auto f_accessor = f_builder->Finish(); + + ASSERT_EQ(false, f_accessor->is_encryption_algorithm_set()); auto file_crypto_metadata = f_builder->GetCryptoMetaData(); ASSERT_EQ(true, file_crypto_metadata != NULLPTR); + + // file metadata + ASSERT_EQ(nrows, f_accessor->num_rows()); + ASSERT_LE(0, static_cast(f_accessor->size())); + ASSERT_EQ(2, f_accessor->num_row_groups()); + ASSERT_EQ(ParquetVersion::PARQUET_2_0, f_accessor->version()); + ASSERT_EQ(DEFAULT_CREATED_BY, f_accessor->created_by()); + ASSERT_EQ(3, f_accessor->num_schema_elements()); + + // row group1 metadata + auto rg1_accessor = f_accessor->RowGroup(0); + ASSERT_EQ(2, rg1_accessor->num_columns()); + ASSERT_EQ(nrows / 2, rg1_accessor->num_rows()); + ASSERT_EQ(1024, rg1_accessor->total_byte_size()); + + auto rg1_column1 = rg1_accessor->ColumnChunk(0); + auto rg1_column2 = rg1_accessor->ColumnChunk(1); + ASSERT_EQ(false, rg1_column1->is_metadata_set()); + ASSERT_THROW(rg1_column1->is_stats_set(), ParquetException); + ASSERT_THROW(rg1_column1->statistics(), ParquetException); + ASSERT_THROW(rg1_column1->compression(), ParquetException); 
+ ASSERT_THROW(rg1_column1->num_values(), ParquetException); + ASSERT_THROW(rg1_column1->encodings(), ParquetException); + ASSERT_THROW(rg1_column1->total_compressed_size(), ParquetException); + ASSERT_THROW(rg1_column1->total_uncompressed_size(), ParquetException); + ASSERT_THROW(rg1_column1->dictionary_page_offset(), ParquetException); + ASSERT_THROW(rg1_column1->data_page_offset(), ParquetException); + + ASSERT_EQ(true, rg1_column2->is_stats_set()); + ASSERT_EQ(stats_float.min(), rg1_column2->statistics()->EncodeMin()); + ASSERT_EQ(stats_float.max(), rg1_column2->statistics()->EncodeMax()); + ASSERT_EQ(0, rg1_column2->statistics()->null_count()); + ASSERT_EQ(nrows, rg1_column2->statistics()->distinct_count()); + ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column2->compression()); + ASSERT_EQ(nrows / 2, rg1_column2->num_values()); + ASSERT_EQ(3, rg1_column2->encodings().size()); + ASSERT_EQ(512, rg1_column2->total_compressed_size()); + ASSERT_EQ(600, rg1_column2->total_uncompressed_size()); + ASSERT_EQ(24, rg1_column2->dictionary_page_offset()); + ASSERT_EQ(30, rg1_column2->data_page_offset()); + + auto rg2_accessor = f_accessor->RowGroup(1); + ASSERT_EQ(2, rg2_accessor->num_columns()); + ASSERT_EQ(nrows / 2, rg2_accessor->num_rows()); + ASSERT_EQ(1024, rg2_accessor->total_byte_size()); + + auto rg2_column1 = rg2_accessor->ColumnChunk(0); + auto rg2_column2 = rg2_accessor->ColumnChunk(1); + ASSERT_EQ(false, rg1_column1->is_metadata_set()); + ASSERT_THROW(rg2_column1->is_stats_set(), ParquetException); + ASSERT_THROW(rg2_column1->statistics(), ParquetException); + ASSERT_THROW(rg2_column1->compression(), ParquetException); + ASSERT_THROW(rg2_column1->num_values(), ParquetException); + ASSERT_THROW(rg2_column1->encodings(), ParquetException); + ASSERT_THROW(rg2_column1->total_compressed_size(), ParquetException); + ASSERT_THROW(rg2_column1->total_uncompressed_size(), ParquetException); + ASSERT_THROW(rg2_column1->dictionary_page_offset(), ParquetException); + 
ASSERT_THROW(rg2_column1->data_page_offset(), ParquetException); + + ASSERT_EQ(true, rg2_column2->is_stats_set()); + ASSERT_EQ(stats_float.min(), rg2_column2->statistics()->EncodeMin()); + ASSERT_EQ(stats_float.max(), rg2_column2->statistics()->EncodeMax()); + ASSERT_EQ(0, rg2_column2->statistics()->null_count()); + ASSERT_EQ(nrows, rg2_column2->statistics()->distinct_count()); + ASSERT_EQ(nrows / 2, rg2_column2->num_values()); + ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column2->compression()); + ASSERT_EQ(3, rg2_column2->encodings().size()); + ASSERT_EQ(512, rg2_column2->total_compressed_size()); + ASSERT_EQ(600, rg2_column2->total_uncompressed_size()); + ASSERT_EQ(16, rg2_column2->dictionary_page_offset()); + ASSERT_EQ(26, rg2_column2->data_page_offset()); } TEST(Metadata, PlaintextFooter) { @@ -65,20 +341,154 @@ TEST(Metadata, PlaintextFooter) { root = parquet::schema::GroupNode::Make("schema", Repetition::REPEATED, fields); schema.Init(root); + int64_t nrows = 1000; + int32_t int_min = 100, int_max = 200; + EncodedStatistics stats_int; + stats_int.set_null_count(0) + .set_distinct_count(nrows) + .set_min(std::string(reinterpret_cast(&int_min), 4)) + .set_max(std::string(reinterpret_cast(&int_max), 4)); + EncodedStatistics stats_float; + float float_min = 100.100f, float_max = 200.200f; + stats_float.set_null_count(0) + .set_distinct_count(nrows) + .set_min(std::string(reinterpret_cast(&float_min), 4)) + .set_max(std::string(reinterpret_cast(&float_max), 4)); + + std::shared_ptr int_col_path = + parquet::schema::ColumnPath::FromDotString("int_col"); + ColumnEncryptionProperties::Builder int_col_builder(int_col_path); + int_col_builder.key(kColumnEncryptionKey1); + int_col_builder.key_id("kc1"); + + std::map, + std::shared_ptr, schema::ColumnPath::CmpColumnPath> + encryption_col_props; + encryption_col_props[int_col_path] = int_col_builder.build(); + FileEncryptionProperties::Builder encryption_prop_builder(kFooterEncryptionKey); 
encryption_prop_builder.footer_key_metadata("kf"); encryption_prop_builder.set_plaintext_footer(); + encryption_prop_builder.column_properties(encryption_col_props); WriterProperties::Builder writer_prop_builder; + writer_prop_builder.version(ParquetVersion::PARQUET_2_0); writer_prop_builder.encryption(encryption_prop_builder.build()); auto props = writer_prop_builder.build(); auto f_builder = FileMetaDataBuilder::Make(&schema, props); - auto file_metadata = f_builder->Finish(); - ASSERT_EQ(true, file_metadata->is_encryption_algorithm_set()); + auto rg1_builder = f_builder->AppendRowGroup(); + + // Write the metadata + // rowgroup1 metadata + auto col1_builder = rg1_builder->NextColumnChunk(); + auto col2_builder = rg1_builder->NextColumnChunk(); + // column metadata + stats_int.set_is_signed(true); + col1_builder->SetStatistics(stats_int); + stats_float.set_is_signed(true); + col2_builder->SetStatistics(stats_float); + col1_builder->Finish(nrows / 2, 4, 0, 10, 512, 600, true, false); + col2_builder->Finish(nrows / 2, 24, 0, 30, 512, 600, true, false); + + rg1_builder->set_num_rows(nrows / 2); + rg1_builder->Finish(1024); + + // rowgroup2 metadata + auto rg2_builder = f_builder->AppendRowGroup(); + col1_builder = rg2_builder->NextColumnChunk(); + col2_builder = rg2_builder->NextColumnChunk(); + // column metadata + col1_builder->SetStatistics(stats_int); + col2_builder->SetStatistics(stats_float); + col1_builder->Finish(nrows / 2, 6, 0, 10, 512, 600, true, false); + col2_builder->Finish(nrows / 2, 16, 0, 26, 512, 600, true, false); + + rg2_builder->set_num_rows(nrows / 2); + rg2_builder->Finish(1024); + + // Read the metadata + auto f_accessor = f_builder->Finish(); + + ASSERT_EQ(true, f_accessor->is_encryption_algorithm_set()); auto file_crypto_metadata = f_builder->GetCryptoMetaData(); ASSERT_EQ(NULLPTR, file_crypto_metadata); + + // file metadata + ASSERT_EQ(nrows, f_accessor->num_rows()); + ASSERT_LE(0, static_cast(f_accessor->size())); + ASSERT_EQ(2, 
f_accessor->num_row_groups()); + ASSERT_EQ(ParquetVersion::PARQUET_2_0, f_accessor->version()); + ASSERT_EQ(DEFAULT_CREATED_BY, f_accessor->created_by()); + ASSERT_EQ(3, f_accessor->num_schema_elements()); + + // row group1 metadata + auto rg1_accessor = f_accessor->RowGroup(0); + ASSERT_EQ(2, rg1_accessor->num_columns()); + ASSERT_EQ(nrows / 2, rg1_accessor->num_rows()); + ASSERT_EQ(1024, rg1_accessor->total_byte_size()); + + auto rg1_column1 = rg1_accessor->ColumnChunk(0); + auto rg1_column2 = rg1_accessor->ColumnChunk(1); + ASSERT_EQ(true, rg1_column1->is_metadata_set()); + ASSERT_EQ(false, rg1_column1->is_stats_set()); + ASSERT_EQ(NULLPTR, rg1_column1->statistics()); + // ASSERT_THROW(rg1_column1->encodings(), ParquetException); + + ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column1->compression()); + ASSERT_EQ(nrows / 2, rg1_column1->num_values()); + ASSERT_EQ(3, rg1_column1->encodings().size()); + ASSERT_EQ(512, rg1_column1->total_compressed_size()); + ASSERT_EQ(600, rg1_column1->total_uncompressed_size()); + ASSERT_EQ(4, rg1_column1->dictionary_page_offset()); + ASSERT_EQ(10, rg1_column1->data_page_offset()); + + ASSERT_EQ(true, rg1_column2->is_stats_set()); + ASSERT_EQ(stats_float.min(), rg1_column2->statistics()->EncodeMin()); + ASSERT_EQ(stats_float.max(), rg1_column2->statistics()->EncodeMax()); + ASSERT_EQ(0, rg1_column2->statistics()->null_count()); + ASSERT_EQ(nrows, rg1_column2->statistics()->distinct_count()); + ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column2->compression()); + ASSERT_EQ(nrows / 2, rg1_column2->num_values()); + ASSERT_EQ(3, rg1_column2->encodings().size()); + ASSERT_EQ(512, rg1_column2->total_compressed_size()); + ASSERT_EQ(600, rg1_column2->total_uncompressed_size()); + ASSERT_EQ(24, rg1_column2->dictionary_page_offset()); + ASSERT_EQ(30, rg1_column2->data_page_offset()); + + auto rg2_accessor = f_accessor->RowGroup(1); + ASSERT_EQ(2, rg2_accessor->num_columns()); + ASSERT_EQ(nrows / 2, rg2_accessor->num_rows()); + ASSERT_EQ(1024, 
rg2_accessor->total_byte_size()); + + auto rg2_column1 = rg2_accessor->ColumnChunk(0); + auto rg2_column2 = rg2_accessor->ColumnChunk(1); + ASSERT_EQ(true, rg2_column1->is_metadata_set()); + ASSERT_EQ(false, rg2_column1->is_stats_set()); + ASSERT_EQ(NULLPTR, rg2_column1->statistics()); + // ASSERT_THROW(rg2_column1->encodings(), ParquetException); + + ASSERT_EQ(nrows / 2, rg2_column1->num_values()); + ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column1->compression()); + ASSERT_EQ(3, rg2_column1->encodings().size()); + ASSERT_EQ(512, rg2_column1->total_compressed_size()); + ASSERT_EQ(600, rg2_column1->total_uncompressed_size()); + ASSERT_EQ(6, rg2_column1->dictionary_page_offset()); + ASSERT_EQ(10, rg2_column1->data_page_offset()); + + ASSERT_EQ(true, rg2_column2->is_stats_set()); + ASSERT_EQ(stats_float.min(), rg2_column2->statistics()->EncodeMin()); + ASSERT_EQ(stats_float.max(), rg2_column2->statistics()->EncodeMax()); + ASSERT_EQ(0, rg2_column2->statistics()->null_count()); + ASSERT_EQ(nrows, rg2_column2->statistics()->distinct_count()); + ASSERT_EQ(nrows / 2, rg2_column2->num_values()); + ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column2->compression()); + ASSERT_EQ(3, rg2_column2->encodings().size()); + ASSERT_EQ(512, rg2_column2->total_compressed_size()); + ASSERT_EQ(600, rg2_column2->total_uncompressed_size()); + ASSERT_EQ(16, rg2_column2->dictionary_page_offset()); + ASSERT_EQ(26, rg2_column2->data_page_offset()); } } // namespace metadata diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 394950ef4be6..ec7524de4a43 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -180,37 +180,36 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { const ApplicationVersion* writer_version, InternalFileDecryptor* file_decryptor = NULLPTR) : column_(column), descr_(descr), writer_version_(writer_version) { + is_metadata_set_ = column->__isset.meta_data; metadata_ = column->meta_data; - if (column->__isset.crypto_metadata) { 
+ if (column->__isset.crypto_metadata && !is_metadata_set_) { format::ColumnCryptoMetaData ccmd = column->crypto_metadata; if (ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY) { - if (file_decryptor->properties() == NULLPTR) { - throw ParquetException( - "Cannot decrypt ColumnMetadata. " - "FileDecryptionProperties must be provided."); + if (file_decryptor != NULLPTR && file_decryptor->properties() != NULLPTR) { + // should decrypt metadata + std::shared_ptr path = std::make_shared( + ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema); + std::string key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata; + + std::string aad_column_metadata = encryption::CreateModuleAad( + file_decryptor->file_aad(), encryption::kColumnMetaData, row_group_ordinal, + column_ordinal, (int16_t)-1); + auto decryptor = file_decryptor->GetColumnMetaDecryptor(path, key_metadata, + aad_column_metadata); + uint32_t len = static_cast(column->encrypted_column_metadata.size()); + DeserializeThriftMsg( + reinterpret_cast(column->encrypted_column_metadata.c_str()), + &len, &metadata_, decryptor, false); + is_metadata_set_ = true; } - // should decrypt metadata - std::shared_ptr path = std::make_shared( - ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema); - std::string key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata; - - DCHECK(file_decryptor != NULLPTR); - - std::string aad_column_metadata = encryption::CreateModuleAad( - file_decryptor->file_aad(), encryption::kColumnMetaData, row_group_ordinal, - column_ordinal, (int16_t)-1); - auto decryptor = file_decryptor->GetColumnMetaDecryptor(path, key_metadata, - aad_column_metadata); - uint32_t len = static_cast(column->encrypted_column_metadata.size()); - DeserializeThriftMsg( - reinterpret_cast(column->encrypted_column_metadata.c_str()), - &len, &metadata_, decryptor, false); } } - for (auto encoding : metadata_.encodings) { - encodings_.push_back(FromThrift(encoding)); + if (is_metadata_set_) { + for (auto encoding : metadata_.encodings) { + 
encodings_.push_back(FromThrift(encoding)); + } } possible_stats_ = nullptr; } @@ -219,12 +218,13 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { inline const std::string& file_path() const { return column_->file_path; } // column metadata - inline Type::type type() const { return FromThrift(metadata_.type); } + inline bool is_metadata_set() const { return is_metadata_set_; } + inline Type::type type() const { return FromThrift(GetMetadataIfSet().type); } - inline int64_t num_values() const { return metadata_.num_values; } + inline int64_t num_values() const { return GetMetadataIfSet().num_values; } std::shared_ptr path_in_schema() { - return std::make_shared(metadata_.path_in_schema); + return std::make_shared(GetMetadataIfSet().path_in_schema); } // Check if statistics are set and are valid @@ -234,11 +234,12 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { DCHECK(writer_version_ != nullptr); // If the column statistics don't exist or column sort order is unknown // we cannot use the column stats - if (!metadata_.__isset.statistics || descr_->sort_order() == SortOrder::UNKNOWN) { + auto metadata = GetMetadataIfSet(); + if (!metadata.__isset.statistics || descr_->sort_order() == SortOrder::UNKNOWN) { return false; } if (possible_stats_ == nullptr) { - possible_stats_ = MakeColumnStats(metadata_, descr_); + possible_stats_ = MakeColumnStats(metadata, descr_); } EncodedStatistics encodedStatistics = possible_stats_->Encode(); return writer_version_->HasCorrectStatistics(type(), encodedStatistics, @@ -249,28 +250,39 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { return is_stats_set() ? 
possible_stats_ : nullptr; } - inline Compression::type compression() const { return FromThrift(metadata_.codec); } + inline Compression::type compression() const { + return FromThrift(GetMetadataIfSet().codec); + } - const std::vector& encodings() const { return encodings_; } + const std::vector& encodings() const { + GetMetadataIfSet(); + return encodings_; + } inline bool has_dictionary_page() const { - return metadata_.__isset.dictionary_page_offset; + return GetMetadataIfSet().__isset.dictionary_page_offset; } inline int64_t dictionary_page_offset() const { - return metadata_.dictionary_page_offset; + return GetMetadataIfSet().dictionary_page_offset; } - inline int64_t data_page_offset() const { return metadata_.data_page_offset; } + inline int64_t data_page_offset() const { return GetMetadataIfSet().data_page_offset; } - inline bool has_index_page() const { return metadata_.__isset.index_page_offset; } + inline bool has_index_page() const { + return GetMetadataIfSet().__isset.index_page_offset; + } - inline int64_t index_page_offset() const { return metadata_.index_page_offset; } + inline int64_t index_page_offset() const { + return GetMetadataIfSet().index_page_offset; + } - inline int64_t total_compressed_size() const { return metadata_.total_compressed_size; } + inline int64_t total_compressed_size() const { + return GetMetadataIfSet().total_compressed_size; + } inline int64_t total_uncompressed_size() const { - return metadata_.total_uncompressed_size; + return GetMetadataIfSet().total_uncompressed_size; } inline std::unique_ptr crypto_metadata() const { @@ -289,6 +301,16 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { format::ColumnMetaData metadata_; const ColumnDescriptor* descr_; const ApplicationVersion* writer_version_; + bool is_metadata_set_; + + inline const format::ColumnMetaData& GetMetadataIfSet() const { + if (!is_metadata_set_) { + throw ParquetException( + "Cannot decrypt ColumnMetadata. 
" + "FileDecryptionProperties must be provided."); + } + return metadata_; + } }; std::unique_ptr ColumnChunkMetaData::Make( @@ -316,6 +338,8 @@ int64_t ColumnChunkMetaData::file_offset() const { return impl_->file_offset(); const std::string& ColumnChunkMetaData::file_path() const { return impl_->file_path(); } // column metadata +bool ColumnChunkMetaData::is_metadata_set() const { return impl_->is_metadata_set(); } + Type::type ColumnChunkMetaData::type() const { return impl_->type(); } int64_t ColumnChunkMetaData::num_values() const { return impl_->num_values(); } @@ -925,27 +949,11 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } column_metadata_.__set_encodings(thrift_encodings); - // temporary fix: setting for columnchunk meta_data in case file is not encrypted - if (properties_->file_encryption_properties() == NULLPTR) { + const auto& encrypt_md = properties_->column_encryption_properties(column_->path()); + if (encrypt_md == NULLPTR || !encrypt_md->is_encrypted()) { column_chunk_->__isset.meta_data = true; column_chunk_->__set_meta_data(column_metadata_); - } - } - - void WriteTo(::arrow::io::OutputStream* sink, - const std::shared_ptr& encryptor) { - ThriftSerializer serializer; - - // column is unencrypted - if (encryptor == NULLPTR) { - column_chunk_->__isset.meta_data = true; - column_chunk_->__set_meta_data(column_metadata_); - - serializer.Serialize(column_chunk_, sink); - } else { // column is encrypted - const auto& encrypt_md = properties_->column_encryption_properties(column_->path()); - bool encrypt_metadata = encryptor->encryptColumnMetaData( - properties_->file_encryption_properties()->encrypted_footer(), encrypt_md); + } else { column_chunk_->__isset.crypto_metadata = true; format::ColumnCryptoMetaData ccmd; if (encrypt_md->is_encrypted_with_footer_key()) { @@ -961,10 +969,62 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } column_chunk_->__set_crypto_metadata(ccmd); + bool encrypted_footer = + 
properties_->file_encryption_properties()->encrypted_footer(); + bool encrypt_metadata = + !encrypted_footer || !encrypt_md->is_encrypted_with_footer_key(); if (!encrypt_metadata) { column_chunk_->__isset.meta_data = true; column_chunk_->__set_meta_data(column_metadata_); - } else { // Serialize and encrypt ColumnMetadata separately + } else if (!encrypted_footer) { + // Keep redacted metadata version for old readers + format::ColumnMetaData metadata_redacted; + metadata_redacted.__set_type(column_metadata_.type); + metadata_redacted.__set_encodings(column_metadata_.encodings); + metadata_redacted.__set_path_in_schema(column_metadata_.path_in_schema); + metadata_redacted.__set_codec(column_metadata_.codec); + metadata_redacted.__set_num_values(column_metadata_.num_values); + metadata_redacted.__set_total_uncompressed_size( + column_metadata_.total_uncompressed_size); + metadata_redacted.__set_total_compressed_size( + column_metadata_.total_compressed_size); + if (column_metadata_.__isset.key_value_metadata) { + metadata_redacted.__isset.key_value_metadata = true; + metadata_redacted.__set_key_value_metadata(column_metadata_.key_value_metadata); + } + metadata_redacted.__set_data_page_offset(column_metadata_.data_page_offset); + if (column_metadata_.__isset.index_page_offset) { + metadata_redacted.__isset.index_page_offset = true; + metadata_redacted.__set_index_page_offset(column_metadata_.index_page_offset); + } + if (column_metadata_.__isset.dictionary_page_offset) { + metadata_redacted.__isset.dictionary_page_offset = true; + metadata_redacted.__set_dictionary_page_offset( + column_metadata_.dictionary_page_offset); + } + metadata_redacted.__isset.statistics = false; + metadata_redacted.__isset.encoding_stats = false; + + column_chunk_->__isset.meta_data = true; + column_chunk_->__set_meta_data(metadata_redacted); + } + } + } + + void WriteTo(::arrow::io::OutputStream* sink, + const std::shared_ptr& encryptor) { + ThriftSerializer serializer; + + const auto& 
encrypt_md = properties_->column_encryption_properties(column_->path()); + // column is unencrypted + if (encrypt_md == NULLPTR || !encrypt_md->is_encrypted()) { + serializer.Serialize(column_chunk_, sink); + } else { // column is encrypted + bool encrypt_metadata = encryptor->encryptColumnMetaData( + properties_->file_encryption_properties()->encrypted_footer(), encrypt_md); + + if (encrypt_metadata) { + // Serialize and encrypt ColumnMetadata separately // Thrift-serialize the ColumnMetaData structure, // encrypt it with the column key, and write to encrypted_column_metadata uint8_t* serialized_data; @@ -982,40 +1042,6 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { const_cast(reinterpret_cast(encrypted_data.data())); std::string encrypted_column_metadata(temp, encrypted_len); column_chunk_->__set_encrypted_column_metadata(encrypted_column_metadata); - // Keep redacted metadata version for old readers - if (!properties_->file_encryption_properties()->encrypted_footer()) { - // metadata_redacted should be stripped of the column_metadata_ statistics. 
- format::ColumnMetaData metadata_redacted; - metadata_redacted.__set_type(column_metadata_.type); - metadata_redacted.__set_encodings(column_metadata_.encodings); - metadata_redacted.__set_path_in_schema(column_metadata_.path_in_schema); - metadata_redacted.__set_codec(column_metadata_.codec); - metadata_redacted.__set_num_values(column_metadata_.num_values); - metadata_redacted.__set_total_uncompressed_size( - column_metadata_.total_uncompressed_size); - metadata_redacted.__set_total_compressed_size( - column_metadata_.total_compressed_size); - if (column_metadata_.__isset.key_value_metadata) { - metadata_redacted.__isset.key_value_metadata = true; - metadata_redacted.__set_key_value_metadata( - column_metadata_.key_value_metadata); - } - metadata_redacted.__set_data_page_offset(column_metadata_.data_page_offset); - if (column_metadata_.__isset.index_page_offset) { - metadata_redacted.__isset.index_page_offset = true; - metadata_redacted.__set_index_page_offset(column_metadata_.index_page_offset); - } - if (column_metadata_.__isset.dictionary_page_offset) { - metadata_redacted.__isset.dictionary_page_offset = true; - metadata_redacted.__set_dictionary_page_offset( - column_metadata_.dictionary_page_offset); - } - metadata_redacted.__isset.statistics = false; - metadata_redacted.__isset.encoding_stats = false; - - column_chunk_->__isset.meta_data = true; - column_chunk_->__set_meta_data(metadata_redacted); - } } serializer.Serialize(column_chunk_, sink); } diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 5d3695a32527..aaeaa322b5bb 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -138,6 +138,7 @@ class PARQUET_EXPORT ColumnChunkMetaData { const std::string& file_path() const; // column metadata + bool is_metadata_set() const; Type::type type() const; int64_t num_values() const; std::shared_ptr path_in_schema() const; From daeb600fe84dfe8189b1fc6d87fd57974253f1ca Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: 
Wed, 5 Jun 2019 08:44:09 +0300 Subject: [PATCH 102/201] Various changes to encryption-reader-writer-all-crypto-options test and to AAD handling in file_reader.cc --- ...yption-reader-writer-all-crypto-options.cc | 244 ++++-------------- ...ryption-reader-writer-all-crypto-options.h | 54 ++++ cpp/src/parquet/file_reader.cc | 60 +++-- 3 files changed, 133 insertions(+), 225 deletions(-) create mode 100644 cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.h diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc index 98f0e57a7aac..5241e559caea 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc @@ -23,7 +23,7 @@ #include #include -#include +#include /* * This file contains samples for writing and reading encrypted Parquet files in different @@ -52,21 +52,25 @@ * here: * https://github.com/apache/parquet-format/blob/encryption/Encryption.md * - * The write sample creates files with eight columns in the following + * The write sample creates files with four columns in the following * encryption configurations: * * - Encryption configuration 1: Encrypt all columns and the footer with the same key. * (uniform encryption) - * - Encryption configuration 2: Encrypt two columns and the footer. - * - Encryption configuration 3: Encrypt two columns. Don’t encrypt footer (to enable - * legacy readers) - plaintext footer mode. - * - Encryption configuration 4: Encrypt two columns and the footer. Supply aad_prefix - * for file identity verification. - * - Encryption configuration 5: Encrypt two columns and the footer. Supply aad_prefix, - * and call disable_aad_prefix_storage to prevent file + * - Encryption configuration 2: Encrypt two columns and the footer, with different + * keys. 
+ * - Encryption configuration 3: Encrypt two columns, with different keys. + * Don’t encrypt footer (to enable legacy readers) + * - plaintext footer mode. + * - Encryption configuration 4: Encrypt two columns and the footer, with different + * keys. Supply aad_prefix for file identity + * verification. + * - Encryption configuration 5: Encrypt two columns and the footer, with different + * keys. Supply aad_prefix, and call + * disable_aad_prefix_storage to prevent file * identity storage in file metadata. - * - Encryption configuration 6: Encrypt two columns and the footer. Use the - * alternative (AES_GCM_CTR_V1) algorithm. + * - Encryption configuration 6: Encrypt two columns and the footer, with different + * keys. Use the alternative (AES_GCM_CTR_V1) algorithm. * * The read sample uses each of the following decryption configurations to read every * encrypted files in the input directory: @@ -88,8 +92,11 @@ const std::string kColumnEncryptionKey2 = "1234567890123451"; const std::string fileName = "tester"; void PrintDecryptionConfiguration(int configuration); +// Check that the decryption result is as expected. void CheckResult(std::string file, int example_id, std::string exception_msg); -int ExtractEncryptionConfigurationNumber(std::string file); +// Returns true if FileName ends with suffix. Otherwise returns false. +// Used to skip unencrypted parquet files. +bool FileNameEndsWith(std::string file_name, std::string suffix); std::vector GetDirectoryFiles(const std::string& path) { std::vector files; @@ -123,7 +130,7 @@ void InteropTestWriteEncryptedParquetFiles(std::string rootPath) { vector_of_encryption_configurations.push_back( file_encryption_builder_1.footer_key_metadata("kf")->build()); - // Encryption configuration 2: Encrypt two columns and the footer. + // Encryption configuration 2: Encrypt two columns and the footer, with different keys. 
std::map, std::shared_ptr, parquet::schema::ColumnPath::CmpColumnPath> @@ -148,7 +155,8 @@ void InteropTestWriteEncryptedParquetFiles(std::string rootPath) { ->column_properties(encryption_cols2) ->build()); - // Encryption configuration 3: Encrypt two columns, don’t encrypt footer. + // Encryption configuration 3: Encrypt two columns, with different keys. + // Don’t encrypt footer. // (plaintext footer mode, readable by legacy readers) std::map, std::shared_ptr, @@ -170,7 +178,8 @@ void InteropTestWriteEncryptedParquetFiles(std::string rootPath) { ->set_plaintext_footer() ->build()); - // Encryption configuration 4: Encrypt two columns and the footer. Use aad_prefix. + // Encryption configuration 4: Encrypt two columns and the footer, with different keys. + // Use aad_prefix. std::map, std::shared_ptr, parquet::schema::ColumnPath::CmpColumnPath> @@ -191,8 +200,8 @@ void InteropTestWriteEncryptedParquetFiles(std::string rootPath) { ->aad_prefix(fileName) ->build()); - // Encryption configuration 5: Encrypt two columns and the footer. Use aad_prefix and - // disable_aad_prefix_storage. + // Encryption configuration 5: Encrypt two columns and the footer, with different keys. + // Use aad_prefix and disable_aad_prefix_storage. std::map, std::shared_ptr, parquet::schema::ColumnPath::CmpColumnPath> @@ -214,8 +223,8 @@ void InteropTestWriteEncryptedParquetFiles(std::string rootPath) { ->disable_store_aad_prefix_storage() ->build()); - // Encryption configuration 6: Encrypt two columns and the footer. Use AES_GCM_CTR_V1 - // algorithm. + // Encryption configuration 6: Encrypt two columns and the footer, with different keys. + // Use AES_GCM_CTR_V1 algorithm. std::map, std::shared_ptr, parquet::schema::ColumnPath::CmpColumnPath> @@ -290,31 +299,6 @@ void InteropTestWriteEncryptedParquetFiles(std::string rootPath) { int32_writer->WriteBatch(1, nullptr, nullptr, &value); } - // Write the Int64 column. Each row has repeats twice. 
- parquet::Int64Writer* int64_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < 2 * NUM_ROWS_PER_ROW_GROUP; i++) { - int64_t value = i * 1000 * 1000; - value *= 1000 * 1000; - int16_t definition_level = 1; - int16_t repetition_level = 0; - if ((i % 2) == 0) { - repetition_level = 1; // start of a new record - } - int64_writer->WriteBatch(1, &definition_level, &repetition_level, &value); - } - - // Write the INT96 column. - parquet::Int96Writer* int96_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { - parquet::Int96 value; - value.value[0] = i; - value.value[1] = i + 1; - value.value[2] = i + 2; - int96_writer->WriteBatch(1, nullptr, nullptr, &value); - } - // Write the Float column parquet::FloatWriter* float_writer = static_cast(rg_writer->NextColumn()); @@ -330,38 +314,6 @@ void InteropTestWriteEncryptedParquetFiles(std::string rootPath) { double value = i * 1.1111111; double_writer->WriteBatch(1, nullptr, nullptr, &value); } - - // Write the ByteArray column. 
Make every alternate values NULL - parquet::ByteArrayWriter* ba_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { - parquet::ByteArray value; - char hello[FIXED_LENGTH] = "parquet"; - hello[7] = static_cast(static_cast('0') + i / 100); - hello[8] = static_cast(static_cast('0') + (i / 10) % 10); - hello[9] = static_cast(static_cast('0') + i % 10); - if (i % 2 == 0) { - int16_t definition_level = 1; - value.ptr = reinterpret_cast(&hello[0]); - value.len = FIXED_LENGTH; - ba_writer->WriteBatch(1, &definition_level, nullptr, &value); - } else { - int16_t definition_level = 0; - ba_writer->WriteBatch(1, &definition_level, nullptr, nullptr); - } - } - - // Write the FixedLengthByteArray column - parquet::FixedLenByteArrayWriter* flba_writer = - static_cast(rg_writer->NextColumn()); - for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) { - parquet::FixedLenByteArray value; - char v = static_cast(i); - char flba[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v}; - value.ptr = reinterpret_cast(&flba[0]); - - flba_writer->WriteBatch(1, nullptr, nullptr, &value); - } // Close the ParquetFileWriter file_writer->Close(); @@ -387,7 +339,6 @@ void InteropTestReadEncryptedParquetFiles(std::string rootPath) { // Decryption configuration 1: Decrypt using key retriever callback that holds the keys // of two encrypted columns and the footer key. 
- std::shared_ptr string_kr1 = std::make_shared(); string_kr1->PutKey("kf", kFooterEncryptionKey); @@ -449,8 +400,7 @@ void InteropTestReadEncryptedParquetFiles(std::string rootPath) { PrintDecryptionConfiguration(example_id + 1); for (auto const& file : files_in_directory) { std::string exception_msg = ""; - if (file.find("parquet.encrypted") == - std::string::npos) // Skip non encrypted files + if (!FileNameEndsWith(file, "parquet.encrypted")) // Skip non encrypted files continue; try { std::cout << "--> Read file " << file << std::endl; @@ -476,7 +426,7 @@ void InteropTestReadEncryptedParquetFiles(std::string rootPath) { // Get the number of Columns int num_columns = file_metadata->num_columns(); - assert(num_columns == 8); + assert(num_columns == 4); // Iterate over all the RowGroups in the file for (int r = 0; r < num_row_groups; ++r) { @@ -486,8 +436,6 @@ void InteropTestReadEncryptedParquetFiles(std::string rootPath) { int64_t values_read = 0; int64_t rows_read = 0; - int16_t definition_level; - int16_t repetition_level; int i; std::shared_ptr column_reader; @@ -534,63 +482,8 @@ void InteropTestReadEncryptedParquetFiles(std::string rootPath) { i++; } - // Get the Column Reader for the Int64 column - column_reader = row_group_reader->Column(2); - parquet::Int64Reader* int64_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (int64_reader->HasNext()) { - int64_t value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = int64_reader->ReadBatch(1, &definition_level, &repetition_level, - &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // There are no NULL values in the rows written - assert(values_read == 1); - // Verify the value written - int64_t expected_value = i * 1000 * 1000; - expected_value *= 1000 * 1000; - assert(value == expected_value); - if ((i % 2) == 0) { - assert(repetition_level == 1); - } else { - assert(repetition_level == 0); - } - i++; - } - - // Get the Column Reader for the Int96 column - column_reader = row_group_reader->Column(3); - parquet::Int96Reader* int96_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (int96_reader->HasNext()) { - parquet::Int96 value; - // Read one value at a time. The number of rows read is returned. values_read - // contains the number of non-null rows - rows_read = - int96_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // There are no NULL values in the rows written - assert(values_read == 1); - // Verify the value written - parquet::Int96 expected_value; - expected_value.value[0] = i; - expected_value.value[1] = i + 1; - expected_value.value[2] = i + 2; - for (int j = 0; j < 3; j++) { - assert(value.value[j] == expected_value.value[j]); - } - i++; - } - // Get the Column Reader for the Float column - column_reader = row_group_reader->Column(4); + column_reader = row_group_reader->Column(2); parquet::FloatReader* float_reader = static_cast(column_reader.get()); // Read all the rows in the column @@ -612,7 +505,7 @@ void InteropTestReadEncryptedParquetFiles(std::string rootPath) { } // Get the Column Reader for the Double column - column_reader = row_group_reader->Column(5); + column_reader = row_group_reader->Column(3); parquet::DoubleReader* double_reader = static_cast(column_reader.get()); // Read 
all the rows in the column @@ -632,61 +525,6 @@ void InteropTestReadEncryptedParquetFiles(std::string rootPath) { assert(value == expected_value); i++; } - - // Get the Column Reader for the ByteArray column - column_reader = row_group_reader->Column(6); - parquet::ByteArrayReader* ba_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (ba_reader->HasNext()) { - parquet::ByteArray value; - // Read one value at a time. The number of rows read is returned. values_read - // contains the number of non-null rows - rows_read = - ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // Verify the value written - char expected_value[FIXED_LENGTH] = "parquet"; - expected_value[7] = static_cast('0' + i / 100); - expected_value[8] = static_cast('0' + (i / 10) % 10); - expected_value[9] = static_cast('0' + i % 10); - if (i % 2 == 0) { // only alternate values exist - // There are no NULL values in the rows written - assert(values_read == 1); - assert(value.len == FIXED_LENGTH); - assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0); - assert(definition_level == 1); - } else { - // There are NULL values in the rows written - assert(values_read == 0); - assert(definition_level == 0); - } - i++; - } - - // Get the Column Reader for the FixedLengthByteArray column - column_reader = row_group_reader->Column(7); - parquet::FixedLenByteArrayReader* flba_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (flba_reader->HasNext()) { - parquet::FixedLenByteArray value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = flba_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - assert(rows_read == 1); - // There are no NULL values in the rows written - assert(values_read == 1); - // Verify the value written - char v = static_cast(i); - char expected_value[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v}; - assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0); - i++; - } } } catch (const std::exception& e) { exception_msg = e.what(); @@ -716,7 +554,8 @@ void PrintDecryptionConfiguration(int configuration) { std::cout << std::endl; } -int ExtractEncryptionConfigurationNumber(std::string file) { +// Check that the decryption result is as expected. +void CheckResult(std::string file, int example_id, std::string exception_msg) { int encryption_configuration_number; std::regex r("tester([0-9]+)\\.parquet.encrypted"); std::smatch m; @@ -732,11 +571,6 @@ int ExtractEncryptionConfigurationNumber(std::string file) { std::cerr << "Error: Unknown encryption configuration number. " << std::endl; } - return encryption_configuration_number; -} - -void CheckResult(std::string file, int example_id, std::string exception_msg) { - int encryption_configuration_number = ExtractEncryptionConfigurationNumber(file); int decryption_configuration_number = example_id + 1; // Encryption_configuration number five contains aad_prefix and @@ -765,6 +599,16 @@ void CheckResult(std::string file, int example_id, std::string exception_msg) { std::cout << "Error: Unexpected exception was thrown." 
<< exception_msg; } +bool FileNameEndsWith(std::string file_name, std::string suffix) { + std::string::size_type idx = file_name.find_first_of('.'); + + if (idx != std::string::npos) { + std::string extension = file_name.substr(idx + 1); + if (extension.compare(suffix) == 0) return true; + } + return false; +} + int main(int argc, char** argv) { enum Operation { write, read }; std::string rootPath; diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.h b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.h new file mode 100644 index 000000000000..db1b692b9f08 --- /dev/null +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.h @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include + +#include +#include + +using parquet::LogicalType; +using parquet::Repetition; +using parquet::Type; +using parquet::schema::GroupNode; +using parquet::schema::PrimitiveNode; + +constexpr int FIXED_LENGTH = 10; + +static std::shared_ptr SetupSchema() { + parquet::schema::NodeVector fields; + // Create a primitive node named 'boolean_field' with type:BOOLEAN, + // repetition:REQUIRED + fields.push_back(PrimitiveNode::Make("boolean_field", Repetition::REQUIRED, + Type::BOOLEAN, LogicalType::NONE)); + + // Create a primitive node named 'int32_field' with type:INT32, repetition:REQUIRED, + // logical type:TIME_MILLIS + fields.push_back(PrimitiveNode::Make("int32_field", Repetition::REQUIRED, Type::INT32, + LogicalType::TIME_MILLIS)); + + fields.push_back(PrimitiveNode::Make("float_field", Repetition::REQUIRED, Type::FLOAT, + LogicalType::NONE)); + + fields.push_back(PrimitiveNode::Make("double_field", Repetition::REQUIRED, Type::DOUBLE, + LogicalType::NONE)); + + // Create a GroupNode named 'schema' using the primitive nodes defined above + // This GroupNode is the root node of the schema tree + return std::static_pointer_cast( + GroupNode::Make("schema", Repetition::REQUIRED, fields)); +} diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index fe2563dffedb..6d27e2dc0035 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -276,41 +276,46 @@ class SerializedFile : public ParquetFileReader::Contents { throw ParquetException("No decryption properties are provided"); } - std::string aad_prefix = file_decryption_properties->aad_prefix(); - + // Handle AAD prefix EncryptionAlgorithm algo = file_metadata_->encryption_algorithm(); - if (algo.aad.supply_aad_prefix && aad_prefix.empty()) { + std::string aad_prefix_in_properites = file_decryption_properties->aad_prefix(); + std::string aad_prefix = aad_prefix_in_properites; + bool file_has_aad_prefix = algo.aad.aad_prefix.empty() ? 
false : true; + std::string aad_prefix_in_file = algo.aad.aad_prefix; + + if (algo.aad.supply_aad_prefix && aad_prefix_in_properites.empty()) { throw ParquetException( - "AAD prefix used for file encryption, but not stored in file" - "and not supplied in decryption properties"); + "AAD prefix used for file encryption, " + "but not stored in file and not supplied " + "in decryption properties"); } - if (!algo.aad.aad_prefix.empty()) { - if (!aad_prefix.empty()) { - if (aad_prefix.compare(algo.aad.aad_prefix) != 0) { + if (file_has_aad_prefix) { + if (!aad_prefix_in_properites.empty()) { + if (aad_prefix_in_properites.compare(aad_prefix_in_file) != 0) { throw ParquetException( - "AAD Prefix in file and in properties is not the same"); + "AAD Prefix in file and in properties " + "is not the same"); } } - aad_prefix = algo.aad.aad_prefix; + aad_prefix = aad_prefix_in_file; std::shared_ptr aad_prefix_verifier = file_decryption_properties->aad_prefix_verifier(); if (aad_prefix_verifier != NULLPTR) aad_prefix_verifier->check(aad_prefix); } else { - if (!algo.aad.supply_aad_prefix && !aad_prefix.empty()) { + if (!algo.aad.supply_aad_prefix && !aad_prefix_in_properites.empty()) { throw ParquetException( - "AAD Prefix set in decryption properties, but not found in file"); + "AAD Prefix set in decryption properties, but was not used " + "for file encryption"); } std::shared_ptr aad_prefix_verifier = file_decryption_properties->aad_prefix_verifier(); if (aad_prefix_verifier != NULLPTR) { throw ParquetException( - "AAD prefix used for file encryption, but not stored in file and not " - "supplied in decryption properties"); + "AAD Prefix Verifier is set, but AAD Prefix not found in file"); } } std::string file_aad = aad_prefix + algo.aad.aad_file_unique; - file_decryptor_.reset(new InternalFileDecryptor( file_decryption_properties, file_aad, algo.algorithm, file_metadata_->footer_signing_key_metadata())); @@ -366,37 +371,42 @@ class SerializedFile : public 
ParquetFileReader::Contents { FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); EncryptionAlgorithm algo = file_crypto_metadata->encryption_algorithm(); - std::string aad_prefix = file_decryption_properties->aad_prefix(); - if (algo.aad.supply_aad_prefix && aad_prefix.empty()) { + // Handle AAD prefix + std::string aad_prefix_in_properites = file_decryption_properties->aad_prefix(); + std::string aad_prefix = aad_prefix_in_properites; + bool file_has_aad_prefix = algo.aad.aad_prefix.empty() ? false : true; + std::string aad_prefix_in_file = algo.aad.aad_prefix; + + if (algo.aad.supply_aad_prefix && aad_prefix_in_properites.empty()) { throw ParquetException( "AAD prefix used for file encryption, " "but not stored in file and not supplied " "in decryption properties"); } - if (!algo.aad.aad_prefix.empty()) { - if (!aad_prefix.empty()) { - if (aad_prefix.compare(algo.aad.aad_prefix) != 0) { + if (file_has_aad_prefix) { + if (!aad_prefix_in_properites.empty()) { + if (aad_prefix_in_properites.compare(aad_prefix_in_file) != 0) { throw ParquetException( "AAD Prefix in file and in properties " "is not the same"); } } - aad_prefix = algo.aad.aad_prefix; + aad_prefix = aad_prefix_in_file; std::shared_ptr aad_prefix_verifier = file_decryption_properties->aad_prefix_verifier(); if (aad_prefix_verifier != NULLPTR) aad_prefix_verifier->check(aad_prefix); } else { - if (!algo.aad.supply_aad_prefix && !aad_prefix.empty()) { + if (!algo.aad.supply_aad_prefix && !aad_prefix_in_properites.empty()) { throw ParquetException( - "AAD Prefix set in decryption properties, but not found in file"); + "AAD Prefix set in decryption properties, but was not used " + "for file encryption"); } std::shared_ptr aad_prefix_verifier = file_decryption_properties->aad_prefix_verifier(); if (aad_prefix_verifier != NULLPTR) { throw ParquetException( - "AAD prefix used for file encryption, but not stored in file and not " - "supplied in decryption properties"); + "AAD Prefix 
Verifier is set, but AAD Prefix not found in file"); } } std::string file_aad = aad_prefix + algo.aad.aad_file_unique; From 980a8f513d0f5aee0056068aaeb0ddd5d19914ac Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Wed, 5 Jun 2019 15:06:05 +0300 Subject: [PATCH 103/201] Fix logging error --- cpp/src/parquet/encryption.cc | 223 ++++++++++++++++++++++++++++++++++ cpp/src/parquet/encryption.h | 208 +++---------------------------- 2 files changed, 243 insertions(+), 188 deletions(-) diff --git a/cpp/src/parquet/encryption.cc b/cpp/src/parquet/encryption.cc index 8a88db002240..096df59b1353 100644 --- a/cpp/src/parquet/encryption.cc +++ b/cpp/src/parquet/encryption.cc @@ -22,6 +22,7 @@ #include #include +#include "arrow/util/logging.h" #include "arrow/util/utf8.h" namespace parquet { @@ -47,6 +48,23 @@ const std::string& StringKeyIdRetriever::GetKey(const std::string& key_id) { return key_map_[key_id]; } +ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key( + std::string column_key) { + if (column_key.empty()) return this; + + DCHECK(key_.empty()); + key_ = column_key; + return this; +} + +ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key_metadata( + const std::string& key_metadata) { + DCHECK(!key_metadata.empty()); + DCHECK(key_metadata_.empty()); + key_metadata_ = key_metadata; + return this; +} + ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key_id( const std::string& key_id) { // key_id is expected to be in UTF8 encoding @@ -61,6 +79,211 @@ ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key_id return this; } +FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::column_properties( + const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& column_properties) { + if (column_properties.size() == 0) return this; + + if (column_properties_.size() != 0) + throw ParquetException("Column properties already set"); + + for (std::pair, + 
std::shared_ptr> + element : column_properties) { + if (element.second->is_utilized()) { + throw ParquetException("Column properties utilized in another file"); + } + element.second->set_utilized(); + } + + column_properties_ = column_properties; + return this; +} + +void FileDecryptionProperties::wipeout_decryption_keys() { + if (!footer_key_.empty()) + std::memset((char*)(const_cast(footer_key_.c_str())), 0, footer_key_.size()); + + for (std::pair, + std::shared_ptr> + element : column_properties_) { + element.second->wipeout_decryption_key(); + } +} + +bool FileDecryptionProperties::is_utilized() { + if (footer_key_.empty() && column_properties_.size() == 0 && aad_prefix_.empty()) + return false; + + return utilized_; +} + +std::shared_ptr FileDecryptionProperties::DeepClone( + std::string new_aad_prefix) { + std::string footer_key_copy = footer_key_; + std::map, + std::shared_ptr, schema::ColumnPath::CmpColumnPath> + column_properties_map_copy; + + for (std::pair, + std::shared_ptr> + element : column_properties_) { + column_properties_map_copy.insert( + std::pair, + std::shared_ptr>( + element.second->column_path(), element.second->DeepClone())); + } + + if (new_aad_prefix.empty()) new_aad_prefix = aad_prefix_; + return std::shared_ptr(new FileDecryptionProperties( + footer_key_copy, key_retriever_, check_plaintext_footer_integrity_, new_aad_prefix, + aad_prefix_verifier_, column_properties_map_copy, plaintext_files_allowed_)); +} + +FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::footer_key( + const std::string footer_key) { + if (footer_key.empty()) { + return this; + } + DCHECK(footer_key_.empty()); + footer_key_ = footer_key; + return this; +} + +FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::key_retriever( + const std::shared_ptr& key_retriever) { + if (key_retriever == NULLPTR) return this; + + DCHECK(key_retriever_ == NULLPTR); + key_retriever_ = key_retriever; + return this; +} + 
+FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::aad_prefix( + const std::string& aad_prefix) { + if (aad_prefix.empty()) { + return this; + } + DCHECK(aad_prefix_.empty()); + aad_prefix_ = aad_prefix; + return this; +} + +FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::aad_prefix_verifier( + std::shared_ptr aad_prefix_verifier) { + if (aad_prefix_verifier == NULLPTR) return this; + + DCHECK(aad_prefix_verifier_ == NULLPTR); + aad_prefix_verifier_ = aad_prefix_verifier; + return this; +} + +ColumnDecryptionProperties::Builder* ColumnDecryptionProperties::Builder::key( + const std::string& key) { + if (key.empty()) return this; + + DCHECK(!key.empty()); + key_ = key; + return this; +} + +std::shared_ptr ColumnDecryptionProperties::Builder::build() { + return std::shared_ptr( + new ColumnDecryptionProperties(column_path_, key_)); +} + +void ColumnDecryptionProperties::wipeout_decryption_key() { + if (!key_.empty()) { + std::memset((char*)(const_cast(key_.c_str())), 0, key_.size()); + } +} + +std::shared_ptr ColumnDecryptionProperties::DeepClone() { + std::string key_copy = key_; + return std::shared_ptr( + new ColumnDecryptionProperties(column_path_, key_copy)); +} + +FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key_metadata( + const std::string& footer_key_metadata) { + if (footer_key_metadata.empty()) return this; + + DCHECK(footer_key_metadata_.empty()); + footer_key_metadata_ = footer_key_metadata; + return this; +} + +FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::column_properties( + const std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>& column_properties) { + if (column_properties.size() == 0) return this; + + if (column_properties_.size() != 0) + throw ParquetException("Column properties already set"); + + for (std::pair, + std::shared_ptr> + element : column_properties) { + if (element.second->is_utilized()) { + throw ParquetException("Column 
properties utilized in another file"); + } + element.second->set_utilized(); + } + column_properties_ = column_properties; + return this; +} + +void FileEncryptionProperties::wipeout_encryption_keys() { + std::memset((char*)(const_cast(footer_key_.c_str())), 0, footer_key_.size()); + for (std::pair, + std::shared_ptr> + element : column_properties_) { + element.second->wipeout_encryption_key(); + } +} + +std::shared_ptr FileEncryptionProperties::DeepClone( + std::string new_aad_prefix) { + std::string footer_key_copy = footer_key_; + std::map, + std::shared_ptr, schema::ColumnPath::CmpColumnPath> + column_properties_map_copy; + + for (std::pair, + std::shared_ptr> + element : column_properties_) { + column_properties_map_copy.insert( + std::pair, + std::shared_ptr>( + element.second->column_path(), element.second->DeepClone())); + } + + if (new_aad_prefix.empty()) new_aad_prefix = aad_prefix_; + return std::shared_ptr(new FileEncryptionProperties( + algorithm_.algorithm, footer_key_copy, footer_key_metadata_, encrypted_footer_, + new_aad_prefix, store_aad_prefix_in_file_, column_properties_map_copy)); +} + +FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::aad_prefix( + const std::string& aad_prefix) { + if (aad_prefix.empty()) return this; + + DCHECK(aad_prefix_.empty()); + aad_prefix_ = aad_prefix; + store_aad_prefix_in_file_ = true; + return this; +} + +FileEncryptionProperties::Builder* +FileEncryptionProperties::Builder::disable_store_aad_prefix_storage() { + DCHECK(!aad_prefix_.empty()); + + store_aad_prefix_in_file_ = false; + return this; +} + ColumnEncryptionProperties::ColumnEncryptionProperties( bool encrypted, const std::shared_ptr& column_path, const std::string& key, const std::string& key_metadata) diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 7d943d15be51..f3994ecf822b 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -24,7 +24,6 @@ #include #include -#include 
"arrow/util/logging.h" #include "parquet/encryption.h" #include "parquet/exception.h" #include "parquet/schema.h" @@ -104,22 +103,11 @@ class PARQUET_EXPORT ColumnEncryptionProperties { /// The key is cloned, and will be wiped out (array values set to 0) upon completion /// of file writing. /// Caller is responsible for wiping out the input key array. - Builder* key(std::string column_key) { - if (column_key.empty()) return this; - - DCHECK(key_.empty()); - key_ = column_key; - return this; - } + Builder* key(std::string column_key); /// Set a key retrieval metadata. /// use either key_metadata() or key_id(), not both - Builder* key_metadata(const std::string& key_metadata) { - DCHECK(!key_metadata.empty()); - DCHECK(key_metadata_.empty()); - key_metadata_ = key_metadata; - return this; - } + Builder* key_metadata(const std::string& key_metadata); /// Set a key retrieval metadata (converted from String). /// use either key_metadata() or key_id(), not both @@ -197,18 +185,9 @@ class PARQUET_EXPORT ColumnDecryptionProperties { /// key metadata for this column the metadata will be ignored, /// the column will be decrypted with this key. /// key length must be either 16, 24 or 32 bytes. 
- Builder* key(const std::string& key) { - if (key.empty()) return this; - - DCHECK(!key.empty()); - key_ = key; - return this; - } + Builder* key(const std::string& key); - std::shared_ptr build() { - return std::shared_ptr( - new ColumnDecryptionProperties(column_path_, key_)); - } + std::shared_ptr build(); private: const std::shared_ptr column_path_; @@ -225,17 +204,9 @@ class PARQUET_EXPORT ColumnDecryptionProperties { void set_utilized() { utilized_ = true; } - void wipeout_decryption_key() { - if (!key_.empty()) { - std::memset((char*)(const_cast(key_.c_str())), 0, key_.size()); - } - } + void wipeout_decryption_key(); - std::shared_ptr DeepClone() { - std::string key_copy = key_; - return std::shared_ptr( - new ColumnDecryptionProperties(column_path_, key_copy)); - } + std::shared_ptr DeepClone(); private: const std::shared_ptr column_path_; @@ -281,14 +252,7 @@ class PARQUET_EXPORT FileDecryptionProperties { /// will be wiped out (array values set to 0). /// Caller is responsible for wiping out the input key array. /// param footerKey Key length must be either 16, 24 or 32 bytes. - Builder* footer_key(const std::string footer_key) { - if (footer_key.empty()) { - return this; - } - DCHECK(footer_key_.empty()); - footer_key_ = footer_key; - return this; - } + Builder* footer_key(const std::string footer_key); /// Set explicit column keys (decryption properties). /// Its also possible to set a key retriever on this property object. 
@@ -299,24 +263,7 @@ class PARQUET_EXPORT FileDecryptionProperties { Builder* column_properties( const std::map, std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties) { - if (column_properties.size() == 0) return this; - - if (column_properties_.size() != 0) - throw ParquetException("Column properties already set"); - - for (std::pair, - std::shared_ptr> - element : column_properties) { - if (element.second->is_utilized()) { - throw ParquetException("Column properties utilized in another file"); - } - element.second->set_utilized(); - } - - column_properties_ = column_properties; - return this; - } + schema::ColumnPath::CmpColumnPath>& column_properties); /// Set a key retriever callback. Its also possible to /// set explicit footer or column keys on this file property object. @@ -324,13 +271,7 @@ class PARQUET_EXPORT FileDecryptionProperties { /// invocation of the retriever callback. /// If an explicit key is available for a footer or a column, /// its key metadata will be ignored. - Builder* key_retriever(const std::shared_ptr& key_retriever) { - if (key_retriever == NULLPTR) return this; - - DCHECK(key_retriever_ == NULLPTR); - key_retriever_ = key_retriever; - return this; - } + Builder* key_retriever(const std::shared_ptr& key_retriever); /// Skip integrity verification of plaintext footers. /// If not called, integrity of plaintext footers will be checked in runtime, @@ -347,23 +288,10 @@ class PARQUET_EXPORT FileDecryptionProperties { /// A must when a prefix is used for file encryption, but not stored in file. /// If AAD prefix is stored in file, it will be compared to the explicitly /// supplied value and an exception will be thrown if they differ. 
- Builder* aad_prefix(const std::string& aad_prefix) { - if (aad_prefix.empty()) { - return this; - } - DCHECK(aad_prefix_.empty()); - aad_prefix_ = aad_prefix; - return this; - } + Builder* aad_prefix(const std::string& aad_prefix); /// Set callback for verification of AAD Prefixes stored in file. - Builder* aad_prefix_verifier(std::shared_ptr aad_prefix_verifier) { - if (aad_prefix_verifier == NULLPTR) return this; - - DCHECK(aad_prefix_verifier_ == NULLPTR); - aad_prefix_verifier_ = aad_prefix_verifier; - return this; - } + Builder* aad_prefix_verifier(std::shared_ptr aad_prefix_verifier); /// By default, reading plaintext (unencrypted) files is not /// allowed when using a decryptor @@ -412,23 +340,9 @@ class PARQUET_EXPORT FileDecryptionProperties { return aad_prefix_verifier_; } - void wipeout_decryption_keys() { - if (!footer_key_.empty()) - std::memset((char*)(const_cast(footer_key_.c_str())), 0, footer_key_.size()); + void wipeout_decryption_keys(); - for (std::pair, - std::shared_ptr> - element : column_properties_) { - element.second->wipeout_decryption_key(); - } - } - - bool is_utilized() { - if (footer_key_.empty() && column_properties_.size() == 0 && aad_prefix_.empty()) - return false; - - return utilized_; - } + bool is_utilized(); void set_utilized() { utilized_ = true; } @@ -439,28 +353,7 @@ class PARQUET_EXPORT FileDecryptionProperties { /// This method allows to clone identical properties for another file, /// with an option to update the aadPrefix (if newAadPrefix is null, /// aadPrefix will be cloned too) - std::shared_ptr DeepClone(std::string new_aad_prefix = "") { - std::string footer_key_copy = footer_key_; - std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath> - column_properties_map_copy; - - for (std::pair, - std::shared_ptr> - element : column_properties_) { - column_properties_map_copy.insert( - std::pair, - std::shared_ptr>( - element.second->column_path(), element.second->DeepClone())); - } - - if 
(new_aad_prefix.empty()) new_aad_prefix = aad_prefix_; - return std::shared_ptr(new FileDecryptionProperties( - footer_key_copy, key_retriever_, check_plaintext_footer_integrity_, - new_aad_prefix, aad_prefix_verifier_, column_properties_map_copy, - plaintext_files_allowed_)); - } + std::shared_ptr DeepClone(std::string new_aad_prefix = ""); private: std::string footer_key_; @@ -520,32 +413,14 @@ class PARQUET_EXPORT FileEncryptionProperties { /// Set a key retrieval metadata. /// use either footer_key_metadata or footer_key_id, not both. - Builder* footer_key_metadata(const std::string& footer_key_metadata) { - if (footer_key_metadata.empty()) return this; - - DCHECK(footer_key_metadata_.empty()); - footer_key_metadata_ = footer_key_metadata; - return this; - } + Builder* footer_key_metadata(const std::string& footer_key_metadata); /// Set the file AAD Prefix. - Builder* aad_prefix(const std::string& aad_prefix) { - if (aad_prefix.empty()) return this; - - DCHECK(aad_prefix_.empty()); - aad_prefix_ = aad_prefix; - store_aad_prefix_in_file_ = true; - return this; - } + Builder* aad_prefix(const std::string& aad_prefix); /// Skip storing AAD Prefix in file. /// If not called, and if AAD Prefix is set, it will be stored. - Builder* disable_store_aad_prefix_storage() { - DCHECK(!aad_prefix_.empty()); - - store_aad_prefix_in_file_ = false; - return this; - } + Builder* disable_store_aad_prefix_storage(); /// Set the list of encrypted columns and their properties (keys etc). /// If not called, all columns will be encrypted with the footer key. 
@@ -553,23 +428,7 @@ class PARQUET_EXPORT FileEncryptionProperties { Builder* column_properties( const std::map, std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties) { - if (column_properties.size() == 0) return this; - - if (column_properties_.size() != 0) - throw ParquetException("Column properties already set"); - - for (std::pair, - std::shared_ptr> - element : column_properties) { - if (element.second->is_utilized()) { - throw ParquetException("Column properties utilized in another file"); - } - element.second->set_utilized(); - } - column_properties_ = column_properties; - return this; - } + schema::ColumnPath::CmpColumnPath>& column_properties); std::shared_ptr build() { return std::shared_ptr(new FileEncryptionProperties( @@ -607,41 +466,14 @@ class PARQUET_EXPORT FileEncryptionProperties { void set_utilized() { utilized_ = true; } - void wipeout_encryption_keys() { - std::memset((char*)(const_cast(footer_key_.c_str())), 0, footer_key_.size()); - for (std::pair, - std::shared_ptr> - element : column_properties_) { - element.second->wipeout_encryption_key(); - } - } + void wipeout_encryption_keys(); /// FileEncryptionProperties object can be used for writing one file only. /// (at the end, keys are wiped out in the memory). 
/// This method allows to clone identical properties for another file, /// with an option to update the aadPrefix (if newAadPrefix is null, /// aadPrefix will be cloned too) - std::shared_ptr DeepClone(std::string new_aad_prefix = "") { - std::string footer_key_copy = footer_key_; - std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath> - column_properties_map_copy; - - for (std::pair, - std::shared_ptr> - element : column_properties_) { - column_properties_map_copy.insert( - std::pair, - std::shared_ptr>( - element.second->column_path(), element.second->DeepClone())); - } - - if (new_aad_prefix.empty()) new_aad_prefix = aad_prefix_; - return std::shared_ptr(new FileEncryptionProperties( - algorithm_.algorithm, footer_key_copy, footer_key_metadata_, encrypted_footer_, - new_aad_prefix, store_aad_prefix_in_file_, column_properties_map_copy)); - } + std::shared_ptr DeepClone(std::string new_aad_prefix = ""); private: EncryptionAlgorithm algorithm_; From cbcac607e16cb67fac9cac8360ecbd313b7624b7 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Fri, 7 Jun 2019 22:49:46 +0700 Subject: [PATCH 104/201] post-rebase change --- cpp/src/parquet/file_writer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index a5cd76e5d749..200b11adf010 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -485,7 +485,7 @@ void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, } void WriteMetaDataFile(const FileMetaData& file_metadata, ArrowOutputStream* sink) { - PARQUET_THROW_NOT_OK(sink->Write(PARQUET_MAGIC, 4)); + PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4)); return WriteFileMetaData(file_metadata, sink); } From ca71babf058c4638c9b3fcfdaefa2b5185f1367c Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Sat, 15 Jun 2019 22:08:21 +0700 Subject: [PATCH 105/201] fix isset of column chunk metadata and statistics --- cpp/src/parquet/file_writer.cc | 2 - 
cpp/src/parquet/metadata.cc | 191 ++++++++++++++++++--------------- 2 files changed, 105 insertions(+), 88 deletions(-) diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 200b11adf010..4eb491f9a661 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -319,8 +319,6 @@ class FileSerializer : public ParquetFileWriter::Contents { file_encryptor_->wipeout_encryption_keys(); } } - - PARQUET_THROW_NOT_OK(sink_->Close()); } } diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index ec7524de4a43..b0a82d00a667 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -180,10 +180,8 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { const ApplicationVersion* writer_version, InternalFileDecryptor* file_decryptor = NULLPTR) : column_(column), descr_(descr), writer_version_(writer_version) { - is_metadata_set_ = column->__isset.meta_data; - metadata_ = column->meta_data; - - if (column->__isset.crypto_metadata && !is_metadata_set_) { + if (column->__isset.crypto_metadata) { // column metadata is encrypted + is_metadata_set_ = false; format::ColumnCryptoMetaData ccmd = column->crypto_metadata; if (ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY) { @@ -205,9 +203,12 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { is_metadata_set_ = true; } } + } else { // column metadata is not encrypted + is_metadata_set_ = true; } if (is_metadata_set_) { - for (auto encoding : metadata_.encodings) { + const format::ColumnMetaData& meta_data = GetMetadataIfSet(); + for (auto encoding : meta_data.encodings) { encodings_.push_back(FromThrift(encoding)); } } @@ -234,12 +235,12 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { DCHECK(writer_version_ != nullptr); // If the column statistics don't exist or column sort order is unknown // we cannot use the column stats - auto metadata = GetMetadataIfSet(); - if (!metadata.__isset.statistics || descr_->sort_order() == SortOrder::UNKNOWN) { + 
const format::ColumnMetaData& meta_data = GetMetadataIfSet(); + if (!meta_data.__isset.statistics || descr_->sort_order() == SortOrder::UNKNOWN) { return false; } if (possible_stats_ == nullptr) { - possible_stats_ = MakeColumnStats(metadata, descr_); + possible_stats_ = MakeColumnStats(meta_data, descr_); } EncodedStatistics encodedStatistics = possible_stats_->Encode(); return writer_version_->HasCorrectStatistics(type(), encodedStatistics, @@ -304,12 +305,17 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { bool is_metadata_set_; inline const format::ColumnMetaData& GetMetadataIfSet() const { - if (!is_metadata_set_) { - throw ParquetException( - "Cannot decrypt ColumnMetadata. " - "FileDecryptionProperties must be provided."); + if (column_->__isset.crypto_metadata) { + if (!is_metadata_set_) { + throw ParquetException( + "Cannot decrypt ColumnMetadata. " + "FileDecryptionProperties must be provided."); + } else { + return metadata_; + } + } else { + return column_->meta_data; } - return metadata_; } }; @@ -908,7 +914,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { // column metadata void SetStatistics(const EncodedStatistics& val) { - column_metadata_.__set_statistics(ToThrift(val)); + column_chunk_->meta_data.__set_statistics(ToThrift(val)); } void Finish(int64_t num_values, int64_t dictionary_page_offset, @@ -916,19 +922,19 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, bool dictionary_fallback) { if (dictionary_page_offset > 0) { - column_metadata_.__set_dictionary_page_offset(dictionary_page_offset); + column_chunk_->meta_data.__set_dictionary_page_offset(dictionary_page_offset); column_chunk_->__set_file_offset(dictionary_page_offset + compressed_size); } else { column_chunk_->__set_file_offset(data_page_offset + compressed_size); } - - column_metadata_.__set_num_values(num_values); + column_chunk_->__isset.meta_data = true; + 
column_chunk_->meta_data.__set_num_values(num_values); if (index_page_offset >= 0) { - column_metadata_.__set_index_page_offset(index_page_offset); + column_chunk_->meta_data.__set_index_page_offset(index_page_offset); } - column_metadata_.__set_data_page_offset(data_page_offset); - column_metadata_.__set_total_uncompressed_size(uncompressed_size); - column_metadata_.__set_total_compressed_size(compressed_size); + column_chunk_->meta_data.__set_data_page_offset(data_page_offset); + column_chunk_->meta_data.__set_total_uncompressed_size(uncompressed_size); + column_chunk_->meta_data.__set_total_compressed_size(compressed_size); std::vector thrift_encodings; if (has_dictionary) { @@ -947,14 +953,38 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { if (dictionary_fallback) { thrift_encodings.push_back(ToThrift(Encoding::PLAIN)); } - column_metadata_.__set_encodings(thrift_encodings); + column_chunk_->meta_data.__set_encodings(thrift_encodings); + } + + void WriteTo(::arrow::io::OutputStream* sink, + const std::shared_ptr& encryptor) { + ThriftSerializer serializer; const auto& encrypt_md = properties_->column_encryption_properties(column_->path()); + // column is unencrypted if (encrypt_md == NULLPTR || !encrypt_md->is_encrypted()) { - column_chunk_->__isset.meta_data = true; - column_chunk_->__set_meta_data(column_metadata_); - } else { - column_chunk_->__isset.crypto_metadata = true; + serializer.Serialize(column_chunk_, sink); + } else { // column is encrypted + // copy column chunk, except for meta_data + format::ColumnChunk column_chunk; + column_chunk.__set_file_offset(column_chunk_->file_offset); + if (column_chunk_->__isset.file_path) { + column_chunk.__set_file_path(column_chunk_->file_path); + } + if (column_chunk_->__isset.offset_index_offset) { + column_chunk.__set_offset_index_offset(column_chunk_->offset_index_offset); + } + if (column_chunk_->__isset.offset_index_length) { + 
column_chunk.__set_offset_index_length(column_chunk_->offset_index_length); + } + if (column_chunk_->__isset.column_index_offset) { + column_chunk.__set_column_index_offset(column_chunk_->column_index_offset); + } + if (column_chunk_->__isset.column_index_length) { + column_chunk.__set_column_index_length(column_chunk_->column_index_length); + } + + column_chunk.__isset.crypto_metadata = true; format::ColumnCryptoMetaData ccmd; if (encrypt_md->is_encrypted_with_footer_key()) { // encrypted with footer key @@ -967,62 +997,12 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY = true; ccmd.__set_ENCRYPTION_WITH_COLUMN_KEY(eck); } - column_chunk_->__set_crypto_metadata(ccmd); + column_chunk.__set_crypto_metadata(ccmd); bool encrypted_footer = properties_->file_encryption_properties()->encrypted_footer(); bool encrypt_metadata = !encrypted_footer || !encrypt_md->is_encrypted_with_footer_key(); - if (!encrypt_metadata) { - column_chunk_->__isset.meta_data = true; - column_chunk_->__set_meta_data(column_metadata_); - } else if (!encrypted_footer) { - // Keep redacted metadata version for old readers - format::ColumnMetaData metadata_redacted; - metadata_redacted.__set_type(column_metadata_.type); - metadata_redacted.__set_encodings(column_metadata_.encodings); - metadata_redacted.__set_path_in_schema(column_metadata_.path_in_schema); - metadata_redacted.__set_codec(column_metadata_.codec); - metadata_redacted.__set_num_values(column_metadata_.num_values); - metadata_redacted.__set_total_uncompressed_size( - column_metadata_.total_uncompressed_size); - metadata_redacted.__set_total_compressed_size( - column_metadata_.total_compressed_size); - if (column_metadata_.__isset.key_value_metadata) { - metadata_redacted.__isset.key_value_metadata = true; - metadata_redacted.__set_key_value_metadata(column_metadata_.key_value_metadata); - } - metadata_redacted.__set_data_page_offset(column_metadata_.data_page_offset); - if 
(column_metadata_.__isset.index_page_offset) { - metadata_redacted.__isset.index_page_offset = true; - metadata_redacted.__set_index_page_offset(column_metadata_.index_page_offset); - } - if (column_metadata_.__isset.dictionary_page_offset) { - metadata_redacted.__isset.dictionary_page_offset = true; - metadata_redacted.__set_dictionary_page_offset( - column_metadata_.dictionary_page_offset); - } - metadata_redacted.__isset.statistics = false; - metadata_redacted.__isset.encoding_stats = false; - - column_chunk_->__isset.meta_data = true; - column_chunk_->__set_meta_data(metadata_redacted); - } - } - } - - void WriteTo(::arrow::io::OutputStream* sink, - const std::shared_ptr& encryptor) { - ThriftSerializer serializer; - - const auto& encrypt_md = properties_->column_encryption_properties(column_->path()); - // column is unencrypted - if (encrypt_md == NULLPTR || !encrypt_md->is_encrypted()) { - serializer.Serialize(column_chunk_, sink); - } else { // column is encrypted - bool encrypt_metadata = encryptor->encryptColumnMetaData( - properties_->file_encryption_properties()->encrypted_footer(), encrypt_md); - if (encrypt_metadata) { // Serialize and encrypt ColumnMetadata separately // Thrift-serialize the ColumnMetaData structure, @@ -1030,7 +1010,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { uint8_t* serialized_data; uint32_t serialized_len; - serializer.SerializeToBuffer(&column_metadata_, &serialized_len, + serializer.SerializeToBuffer(&column_chunk_->meta_data, &serialized_len, &serialized_data); std::vector encrypted_data(encryptor->CiphertextSizeDelta() + @@ -1041,28 +1021,67 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { const char* temp = const_cast(reinterpret_cast(encrypted_data.data())); std::string encrypted_column_metadata(temp, encrypted_len); - column_chunk_->__set_encrypted_column_metadata(encrypted_column_metadata); + column_chunk.__set_encrypted_column_metadata(encrypted_column_metadata); + + if 
(encrypted_footer) { + column_chunk.__isset.meta_data = false; + } else { + // Keep redacted metadata version for old readers + format::ColumnMetaData metadata_redacted; + metadata_redacted.__set_type(column_chunk_->meta_data.type); + metadata_redacted.__set_encodings(column_chunk_->meta_data.encodings); + metadata_redacted.__set_path_in_schema(column_chunk_->meta_data.path_in_schema); + metadata_redacted.__set_codec(column_chunk_->meta_data.codec); + metadata_redacted.__set_num_values(column_chunk_->meta_data.num_values); + metadata_redacted.__set_total_uncompressed_size( + column_chunk_->meta_data.total_uncompressed_size); + metadata_redacted.__set_total_compressed_size( + column_chunk_->meta_data.total_compressed_size); + if (column_chunk_->meta_data.__isset.key_value_metadata) { + metadata_redacted.__isset.key_value_metadata = true; + metadata_redacted.__set_key_value_metadata( + column_chunk_->meta_data.key_value_metadata); + } + metadata_redacted.__set_data_page_offset( + column_chunk_->meta_data.data_page_offset); + if (column_chunk_->meta_data.__isset.index_page_offset) { + metadata_redacted.__isset.index_page_offset = true; + metadata_redacted.__set_index_page_offset( + column_chunk_->meta_data.index_page_offset); + } + if (column_chunk_->meta_data.__isset.dictionary_page_offset) { + metadata_redacted.__isset.dictionary_page_offset = true; + metadata_redacted.__set_dictionary_page_offset( + column_chunk_->meta_data.dictionary_page_offset); + } + metadata_redacted.__isset.statistics = false; + metadata_redacted.__isset.encoding_stats = false; + + column_chunk.__isset.meta_data = true; + column_chunk.__set_meta_data(metadata_redacted); + } } - serializer.Serialize(column_chunk_, sink); + serializer.Serialize(&column_chunk, sink); } } const ColumnDescriptor* descr() const { return column_; } - int64_t total_compressed_size() const { return column_metadata_.total_compressed_size; } + int64_t total_compressed_size() const { + return 
column_chunk_->meta_data.total_compressed_size; + } private: void Init(format::ColumnChunk* column_chunk) { column_chunk_ = column_chunk; - column_metadata_ = column_chunk_->meta_data; - column_metadata_.__set_type(ToThrift(column_->physical_type())); - column_metadata_.__set_path_in_schema(column_->path()->ToDotVector()); - column_metadata_.__set_codec(ToThrift(properties_->compression(column_->path()))); + column_chunk_->meta_data.__set_type(ToThrift(column_->physical_type())); + column_chunk_->meta_data.__set_path_in_schema(column_->path()->ToDotVector()); + column_chunk_->meta_data.__set_codec( + ToThrift(properties_->compression(column_->path()))); } format::ColumnChunk* column_chunk_; std::unique_ptr owned_column_chunk_; - format::ColumnMetaData column_metadata_; const std::shared_ptr properties_; const ColumnDescriptor* column_; }; From 8c2d44959dd2e5a167a5d74f605efd485169f352 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Sun, 16 Jun 2019 16:11:33 +0700 Subject: [PATCH 106/201] temporarily remove encryption-metadata-test --- cpp/src/parquet/CMakeLists.txt | 1 - cpp/src/parquet/encryption-metadata-test.cc | 495 -------------------- 2 files changed, 496 deletions(-) delete mode 100644 cpp/src/parquet/encryption-metadata-test.cc diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 68f2ae7ec17a..77de75553f0c 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -323,7 +323,6 @@ add_parquet_test(arrow-test if(PARQUET_BUILD_ENCRYPTION) add_parquet_test(encryption-test) - add_parquet_test(encryption-metadata-test) endif() # Those tests need to use static linking as they access thrift-generated diff --git a/cpp/src/parquet/encryption-metadata-test.cc b/cpp/src/parquet/encryption-metadata-test.cc deleted file mode 100644 index 7ff31f9b2122..000000000000 --- a/cpp/src/parquet/encryption-metadata-test.cc +++ /dev/null @@ -1,495 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or 
more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "parquet/metadata.h" - -#include - -#include "parquet/properties.h" -#include "parquet/schema.h" -#include "parquet/statistics.h" - -namespace parquet { - -namespace metadata { - -const char kFooterEncryptionKey[] = "0123456789012345"; // 128bit/16 -const char kColumnEncryptionKey1[] = "1234567890123450"; -// const char kColumnEncryptionKey2[] = "1234567890123451"; - -TEST(Metadata, UniformEncryption) { - parquet::schema::NodeVector fields; - parquet::schema::NodePtr root; - parquet::SchemaDescriptor schema; - - fields.push_back(parquet::schema::Int32("int_col", Repetition::REQUIRED)); - fields.push_back(parquet::schema::Float("float_col", Repetition::REQUIRED)); - root = parquet::schema::GroupNode::Make("schema", Repetition::REPEATED, fields); - schema.Init(root); - - int64_t nrows = 1000; - int32_t int_min = 100, int_max = 200; - EncodedStatistics stats_int; - stats_int.set_null_count(0) - .set_distinct_count(nrows) - .set_min(std::string(reinterpret_cast(&int_min), 4)) - .set_max(std::string(reinterpret_cast(&int_max), 4)); - EncodedStatistics stats_float; - float float_min = 100.100f, float_max = 200.200f; - stats_float.set_null_count(0) - .set_distinct_count(nrows) - .set_min(std::string(reinterpret_cast(&float_min), 4)) 
- .set_max(std::string(reinterpret_cast(&float_max), 4)); - - FileEncryptionProperties::Builder encryption_prop_builder(kFooterEncryptionKey); - encryption_prop_builder.footer_key_metadata("kf"); - - WriterProperties::Builder writer_prop_builder; - writer_prop_builder.version(ParquetVersion::PARQUET_2_0); - writer_prop_builder.encryption(encryption_prop_builder.build()); - auto props = writer_prop_builder.build(); - - auto f_builder = FileMetaDataBuilder::Make(&schema, props); - auto rg1_builder = f_builder->AppendRowGroup(); - - // Write the metadata - // rowgroup1 metadata - auto col1_builder = rg1_builder->NextColumnChunk(); - auto col2_builder = rg1_builder->NextColumnChunk(); - // column metadata - stats_int.set_is_signed(true); - col1_builder->SetStatistics(stats_int); - stats_float.set_is_signed(true); - col2_builder->SetStatistics(stats_float); - col1_builder->Finish(nrows / 2, 4, 0, 10, 512, 600, true, false); - col2_builder->Finish(nrows / 2, 24, 0, 30, 512, 600, true, false); - - rg1_builder->set_num_rows(nrows / 2); - rg1_builder->Finish(1024); - - // rowgroup2 metadata - auto rg2_builder = f_builder->AppendRowGroup(); - col1_builder = rg2_builder->NextColumnChunk(); - col2_builder = rg2_builder->NextColumnChunk(); - // column metadata - col1_builder->SetStatistics(stats_int); - col2_builder->SetStatistics(stats_float); - col1_builder->Finish(nrows / 2, 6, 0, 10, 512, 600, true, false); - col2_builder->Finish(nrows / 2, 16, 0, 26, 512, 600, true, false); - - rg2_builder->set_num_rows(nrows / 2); - rg2_builder->Finish(1024); - - // Read the metadata - auto f_accessor = f_builder->Finish(); - - ASSERT_EQ(false, f_accessor->is_encryption_algorithm_set()); - - auto file_crypto_metadata = f_builder->GetCryptoMetaData(); - ASSERT_EQ(true, file_crypto_metadata != NULLPTR); - - // file metadata - ASSERT_EQ(nrows, f_accessor->num_rows()); - ASSERT_LE(0, static_cast(f_accessor->size())); - ASSERT_EQ(2, f_accessor->num_row_groups()); - 
ASSERT_EQ(ParquetVersion::PARQUET_2_0, f_accessor->version()); - ASSERT_EQ(DEFAULT_CREATED_BY, f_accessor->created_by()); - ASSERT_EQ(3, f_accessor->num_schema_elements()); - - // row group1 metadata - auto rg1_accessor = f_accessor->RowGroup(0); - ASSERT_EQ(2, rg1_accessor->num_columns()); - ASSERT_EQ(nrows / 2, rg1_accessor->num_rows()); - ASSERT_EQ(1024, rg1_accessor->total_byte_size()); - - auto rg1_column1 = rg1_accessor->ColumnChunk(0); - auto rg1_column2 = rg1_accessor->ColumnChunk(1); - ASSERT_EQ(true, rg1_column1->is_stats_set()); - ASSERT_EQ(true, rg1_column2->is_stats_set()); - ASSERT_EQ(stats_float.min(), rg1_column2->statistics()->EncodeMin()); - ASSERT_EQ(stats_float.max(), rg1_column2->statistics()->EncodeMax()); - ASSERT_EQ(stats_int.min(), rg1_column1->statistics()->EncodeMin()); - ASSERT_EQ(stats_int.max(), rg1_column1->statistics()->EncodeMax()); - ASSERT_EQ(0, rg1_column1->statistics()->null_count()); - ASSERT_EQ(0, rg1_column2->statistics()->null_count()); - ASSERT_EQ(nrows, rg1_column1->statistics()->distinct_count()); - ASSERT_EQ(nrows, rg1_column2->statistics()->distinct_count()); - ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column1->compression()); - ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column2->compression()); - ASSERT_EQ(nrows / 2, rg1_column1->num_values()); - ASSERT_EQ(nrows / 2, rg1_column2->num_values()); - ASSERT_EQ(3, rg1_column1->encodings().size()); - ASSERT_EQ(3, rg1_column2->encodings().size()); - ASSERT_EQ(512, rg1_column1->total_compressed_size()); - ASSERT_EQ(512, rg1_column2->total_compressed_size()); - ASSERT_EQ(600, rg1_column1->total_uncompressed_size()); - ASSERT_EQ(600, rg1_column2->total_uncompressed_size()); - ASSERT_EQ(4, rg1_column1->dictionary_page_offset()); - ASSERT_EQ(24, rg1_column2->dictionary_page_offset()); - ASSERT_EQ(10, rg1_column1->data_page_offset()); - ASSERT_EQ(30, rg1_column2->data_page_offset()); - - auto rg2_accessor = f_accessor->RowGroup(1); - ASSERT_EQ(2, rg2_accessor->num_columns()); - 
ASSERT_EQ(nrows / 2, rg2_accessor->num_rows()); - ASSERT_EQ(1024, rg2_accessor->total_byte_size()); - - auto rg2_column1 = rg2_accessor->ColumnChunk(0); - auto rg2_column2 = rg2_accessor->ColumnChunk(1); - ASSERT_EQ(true, rg2_column1->is_stats_set()); - ASSERT_EQ(true, rg2_column2->is_stats_set()); - ASSERT_EQ(stats_float.min(), rg2_column2->statistics()->EncodeMin()); - ASSERT_EQ(stats_float.max(), rg2_column2->statistics()->EncodeMax()); - ASSERT_EQ(stats_int.min(), rg1_column1->statistics()->EncodeMin()); - ASSERT_EQ(stats_int.max(), rg1_column1->statistics()->EncodeMax()); - ASSERT_EQ(0, rg2_column1->statistics()->null_count()); - ASSERT_EQ(0, rg2_column2->statistics()->null_count()); - ASSERT_EQ(nrows, rg2_column1->statistics()->distinct_count()); - ASSERT_EQ(nrows, rg2_column2->statistics()->distinct_count()); - ASSERT_EQ(nrows / 2, rg2_column1->num_values()); - ASSERT_EQ(nrows / 2, rg2_column2->num_values()); - ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column1->compression()); - ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column2->compression()); - ASSERT_EQ(3, rg2_column1->encodings().size()); - ASSERT_EQ(3, rg2_column2->encodings().size()); - ASSERT_EQ(512, rg2_column1->total_compressed_size()); - ASSERT_EQ(512, rg2_column2->total_compressed_size()); - ASSERT_EQ(600, rg2_column1->total_uncompressed_size()); - ASSERT_EQ(600, rg2_column2->total_uncompressed_size()); - ASSERT_EQ(6, rg2_column1->dictionary_page_offset()); - ASSERT_EQ(16, rg2_column2->dictionary_page_offset()); - ASSERT_EQ(10, rg2_column1->data_page_offset()); - ASSERT_EQ(26, rg2_column2->data_page_offset()); -} - -TEST(Metadata, EncryptFooterAndOneColumn) { - parquet::schema::NodeVector fields; - parquet::schema::NodePtr root; - parquet::SchemaDescriptor schema; - - fields.push_back(parquet::schema::Int32("int_col", Repetition::REQUIRED)); - fields.push_back(parquet::schema::Float("float_col", Repetition::REQUIRED)); - root = parquet::schema::GroupNode::Make("schema", Repetition::REPEATED, fields); 
- schema.Init(root); - - int64_t nrows = 1000; - int32_t int_min = 100, int_max = 200; - EncodedStatistics stats_int; - stats_int.set_null_count(0) - .set_distinct_count(nrows) - .set_min(std::string(reinterpret_cast(&int_min), 4)) - .set_max(std::string(reinterpret_cast(&int_max), 4)); - EncodedStatistics stats_float; - float float_min = 100.100f, float_max = 200.200f; - stats_float.set_null_count(0) - .set_distinct_count(nrows) - .set_min(std::string(reinterpret_cast(&float_min), 4)) - .set_max(std::string(reinterpret_cast(&float_max), 4)); - - std::shared_ptr int_col_path = - parquet::schema::ColumnPath::FromDotString("int_col"); - ColumnEncryptionProperties::Builder int_col_builder(int_col_path); - int_col_builder.key(kColumnEncryptionKey1); - int_col_builder.key_id("kc1"); - - std::map, - std::shared_ptr, schema::ColumnPath::CmpColumnPath> - encryption_col_props; - encryption_col_props[int_col_path] = int_col_builder.build(); - - FileEncryptionProperties::Builder encryption_prop_builder(kFooterEncryptionKey); - encryption_prop_builder.footer_key_metadata("kf"); - encryption_prop_builder.column_properties(encryption_col_props); - - WriterProperties::Builder writer_prop_builder; - writer_prop_builder.version(ParquetVersion::PARQUET_2_0); - writer_prop_builder.encryption(encryption_prop_builder.build()); - auto props = writer_prop_builder.build(); - - auto f_builder = FileMetaDataBuilder::Make(&schema, props); - auto rg1_builder = f_builder->AppendRowGroup(); - - // Write the metadata - // rowgroup1 metadata - auto col1_builder = rg1_builder->NextColumnChunk(); - auto col2_builder = rg1_builder->NextColumnChunk(); - // column metadata - stats_int.set_is_signed(true); - col1_builder->SetStatistics(stats_int); - stats_float.set_is_signed(true); - col2_builder->SetStatistics(stats_float); - col1_builder->Finish(nrows / 2, 4, 0, 10, 512, 600, true, false); - col2_builder->Finish(nrows / 2, 24, 0, 30, 512, 600, true, false); - - rg1_builder->set_num_rows(nrows / 2); - 
rg1_builder->Finish(1024); - - // rowgroup2 metadata - auto rg2_builder = f_builder->AppendRowGroup(); - col1_builder = rg2_builder->NextColumnChunk(); - col2_builder = rg2_builder->NextColumnChunk(); - // column metadata - col1_builder->SetStatistics(stats_int); - col2_builder->SetStatistics(stats_float); - col1_builder->Finish(nrows / 2, 6, 0, 10, 512, 600, true, false); - col2_builder->Finish(nrows / 2, 16, 0, 26, 512, 600, true, false); - - rg2_builder->set_num_rows(nrows / 2); - rg2_builder->Finish(1024); - - // Read the metadata - auto f_accessor = f_builder->Finish(); - - ASSERT_EQ(false, f_accessor->is_encryption_algorithm_set()); - - auto file_crypto_metadata = f_builder->GetCryptoMetaData(); - ASSERT_EQ(true, file_crypto_metadata != NULLPTR); - - // file metadata - ASSERT_EQ(nrows, f_accessor->num_rows()); - ASSERT_LE(0, static_cast(f_accessor->size())); - ASSERT_EQ(2, f_accessor->num_row_groups()); - ASSERT_EQ(ParquetVersion::PARQUET_2_0, f_accessor->version()); - ASSERT_EQ(DEFAULT_CREATED_BY, f_accessor->created_by()); - ASSERT_EQ(3, f_accessor->num_schema_elements()); - - // row group1 metadata - auto rg1_accessor = f_accessor->RowGroup(0); - ASSERT_EQ(2, rg1_accessor->num_columns()); - ASSERT_EQ(nrows / 2, rg1_accessor->num_rows()); - ASSERT_EQ(1024, rg1_accessor->total_byte_size()); - - auto rg1_column1 = rg1_accessor->ColumnChunk(0); - auto rg1_column2 = rg1_accessor->ColumnChunk(1); - ASSERT_EQ(false, rg1_column1->is_metadata_set()); - ASSERT_THROW(rg1_column1->is_stats_set(), ParquetException); - ASSERT_THROW(rg1_column1->statistics(), ParquetException); - ASSERT_THROW(rg1_column1->compression(), ParquetException); - ASSERT_THROW(rg1_column1->num_values(), ParquetException); - ASSERT_THROW(rg1_column1->encodings(), ParquetException); - ASSERT_THROW(rg1_column1->total_compressed_size(), ParquetException); - ASSERT_THROW(rg1_column1->total_uncompressed_size(), ParquetException); - ASSERT_THROW(rg1_column1->dictionary_page_offset(), 
ParquetException); - ASSERT_THROW(rg1_column1->data_page_offset(), ParquetException); - - ASSERT_EQ(true, rg1_column2->is_stats_set()); - ASSERT_EQ(stats_float.min(), rg1_column2->statistics()->EncodeMin()); - ASSERT_EQ(stats_float.max(), rg1_column2->statistics()->EncodeMax()); - ASSERT_EQ(0, rg1_column2->statistics()->null_count()); - ASSERT_EQ(nrows, rg1_column2->statistics()->distinct_count()); - ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column2->compression()); - ASSERT_EQ(nrows / 2, rg1_column2->num_values()); - ASSERT_EQ(3, rg1_column2->encodings().size()); - ASSERT_EQ(512, rg1_column2->total_compressed_size()); - ASSERT_EQ(600, rg1_column2->total_uncompressed_size()); - ASSERT_EQ(24, rg1_column2->dictionary_page_offset()); - ASSERT_EQ(30, rg1_column2->data_page_offset()); - - auto rg2_accessor = f_accessor->RowGroup(1); - ASSERT_EQ(2, rg2_accessor->num_columns()); - ASSERT_EQ(nrows / 2, rg2_accessor->num_rows()); - ASSERT_EQ(1024, rg2_accessor->total_byte_size()); - - auto rg2_column1 = rg2_accessor->ColumnChunk(0); - auto rg2_column2 = rg2_accessor->ColumnChunk(1); - ASSERT_EQ(false, rg1_column1->is_metadata_set()); - ASSERT_THROW(rg2_column1->is_stats_set(), ParquetException); - ASSERT_THROW(rg2_column1->statistics(), ParquetException); - ASSERT_THROW(rg2_column1->compression(), ParquetException); - ASSERT_THROW(rg2_column1->num_values(), ParquetException); - ASSERT_THROW(rg2_column1->encodings(), ParquetException); - ASSERT_THROW(rg2_column1->total_compressed_size(), ParquetException); - ASSERT_THROW(rg2_column1->total_uncompressed_size(), ParquetException); - ASSERT_THROW(rg2_column1->dictionary_page_offset(), ParquetException); - ASSERT_THROW(rg2_column1->data_page_offset(), ParquetException); - - ASSERT_EQ(true, rg2_column2->is_stats_set()); - ASSERT_EQ(stats_float.min(), rg2_column2->statistics()->EncodeMin()); - ASSERT_EQ(stats_float.max(), rg2_column2->statistics()->EncodeMax()); - ASSERT_EQ(0, rg2_column2->statistics()->null_count()); - 
ASSERT_EQ(nrows, rg2_column2->statistics()->distinct_count()); - ASSERT_EQ(nrows / 2, rg2_column2->num_values()); - ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column2->compression()); - ASSERT_EQ(3, rg2_column2->encodings().size()); - ASSERT_EQ(512, rg2_column2->total_compressed_size()); - ASSERT_EQ(600, rg2_column2->total_uncompressed_size()); - ASSERT_EQ(16, rg2_column2->dictionary_page_offset()); - ASSERT_EQ(26, rg2_column2->data_page_offset()); -} - -TEST(Metadata, PlaintextFooter) { - parquet::schema::NodeVector fields; - parquet::schema::NodePtr root; - parquet::SchemaDescriptor schema; - - fields.push_back(parquet::schema::Int32("int_col", Repetition::REQUIRED)); - fields.push_back(parquet::schema::Float("float_col", Repetition::REQUIRED)); - root = parquet::schema::GroupNode::Make("schema", Repetition::REPEATED, fields); - schema.Init(root); - - int64_t nrows = 1000; - int32_t int_min = 100, int_max = 200; - EncodedStatistics stats_int; - stats_int.set_null_count(0) - .set_distinct_count(nrows) - .set_min(std::string(reinterpret_cast(&int_min), 4)) - .set_max(std::string(reinterpret_cast(&int_max), 4)); - EncodedStatistics stats_float; - float float_min = 100.100f, float_max = 200.200f; - stats_float.set_null_count(0) - .set_distinct_count(nrows) - .set_min(std::string(reinterpret_cast(&float_min), 4)) - .set_max(std::string(reinterpret_cast(&float_max), 4)); - - std::shared_ptr int_col_path = - parquet::schema::ColumnPath::FromDotString("int_col"); - ColumnEncryptionProperties::Builder int_col_builder(int_col_path); - int_col_builder.key(kColumnEncryptionKey1); - int_col_builder.key_id("kc1"); - - std::map, - std::shared_ptr, schema::ColumnPath::CmpColumnPath> - encryption_col_props; - encryption_col_props[int_col_path] = int_col_builder.build(); - - FileEncryptionProperties::Builder encryption_prop_builder(kFooterEncryptionKey); - encryption_prop_builder.footer_key_metadata("kf"); - encryption_prop_builder.set_plaintext_footer(); - 
encryption_prop_builder.column_properties(encryption_col_props); - - WriterProperties::Builder writer_prop_builder; - writer_prop_builder.version(ParquetVersion::PARQUET_2_0); - writer_prop_builder.encryption(encryption_prop_builder.build()); - auto props = writer_prop_builder.build(); - - auto f_builder = FileMetaDataBuilder::Make(&schema, props); - auto rg1_builder = f_builder->AppendRowGroup(); - - // Write the metadata - // rowgroup1 metadata - auto col1_builder = rg1_builder->NextColumnChunk(); - auto col2_builder = rg1_builder->NextColumnChunk(); - // column metadata - stats_int.set_is_signed(true); - col1_builder->SetStatistics(stats_int); - stats_float.set_is_signed(true); - col2_builder->SetStatistics(stats_float); - col1_builder->Finish(nrows / 2, 4, 0, 10, 512, 600, true, false); - col2_builder->Finish(nrows / 2, 24, 0, 30, 512, 600, true, false); - - rg1_builder->set_num_rows(nrows / 2); - rg1_builder->Finish(1024); - - // rowgroup2 metadata - auto rg2_builder = f_builder->AppendRowGroup(); - col1_builder = rg2_builder->NextColumnChunk(); - col2_builder = rg2_builder->NextColumnChunk(); - // column metadata - col1_builder->SetStatistics(stats_int); - col2_builder->SetStatistics(stats_float); - col1_builder->Finish(nrows / 2, 6, 0, 10, 512, 600, true, false); - col2_builder->Finish(nrows / 2, 16, 0, 26, 512, 600, true, false); - - rg2_builder->set_num_rows(nrows / 2); - rg2_builder->Finish(1024); - - // Read the metadata - auto f_accessor = f_builder->Finish(); - - ASSERT_EQ(true, f_accessor->is_encryption_algorithm_set()); - - auto file_crypto_metadata = f_builder->GetCryptoMetaData(); - ASSERT_EQ(NULLPTR, file_crypto_metadata); - - // file metadata - ASSERT_EQ(nrows, f_accessor->num_rows()); - ASSERT_LE(0, static_cast(f_accessor->size())); - ASSERT_EQ(2, f_accessor->num_row_groups()); - ASSERT_EQ(ParquetVersion::PARQUET_2_0, f_accessor->version()); - ASSERT_EQ(DEFAULT_CREATED_BY, f_accessor->created_by()); - ASSERT_EQ(3, 
f_accessor->num_schema_elements()); - - // row group1 metadata - auto rg1_accessor = f_accessor->RowGroup(0); - ASSERT_EQ(2, rg1_accessor->num_columns()); - ASSERT_EQ(nrows / 2, rg1_accessor->num_rows()); - ASSERT_EQ(1024, rg1_accessor->total_byte_size()); - - auto rg1_column1 = rg1_accessor->ColumnChunk(0); - auto rg1_column2 = rg1_accessor->ColumnChunk(1); - ASSERT_EQ(true, rg1_column1->is_metadata_set()); - ASSERT_EQ(false, rg1_column1->is_stats_set()); - ASSERT_EQ(NULLPTR, rg1_column1->statistics()); - // ASSERT_THROW(rg1_column1->encodings(), ParquetException); - - ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column1->compression()); - ASSERT_EQ(nrows / 2, rg1_column1->num_values()); - ASSERT_EQ(3, rg1_column1->encodings().size()); - ASSERT_EQ(512, rg1_column1->total_compressed_size()); - ASSERT_EQ(600, rg1_column1->total_uncompressed_size()); - ASSERT_EQ(4, rg1_column1->dictionary_page_offset()); - ASSERT_EQ(10, rg1_column1->data_page_offset()); - - ASSERT_EQ(true, rg1_column2->is_stats_set()); - ASSERT_EQ(stats_float.min(), rg1_column2->statistics()->EncodeMin()); - ASSERT_EQ(stats_float.max(), rg1_column2->statistics()->EncodeMax()); - ASSERT_EQ(0, rg1_column2->statistics()->null_count()); - ASSERT_EQ(nrows, rg1_column2->statistics()->distinct_count()); - ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column2->compression()); - ASSERT_EQ(nrows / 2, rg1_column2->num_values()); - ASSERT_EQ(3, rg1_column2->encodings().size()); - ASSERT_EQ(512, rg1_column2->total_compressed_size()); - ASSERT_EQ(600, rg1_column2->total_uncompressed_size()); - ASSERT_EQ(24, rg1_column2->dictionary_page_offset()); - ASSERT_EQ(30, rg1_column2->data_page_offset()); - - auto rg2_accessor = f_accessor->RowGroup(1); - ASSERT_EQ(2, rg2_accessor->num_columns()); - ASSERT_EQ(nrows / 2, rg2_accessor->num_rows()); - ASSERT_EQ(1024, rg2_accessor->total_byte_size()); - - auto rg2_column1 = rg2_accessor->ColumnChunk(0); - auto rg2_column2 = rg2_accessor->ColumnChunk(1); - ASSERT_EQ(true, 
rg2_column1->is_metadata_set()); - ASSERT_EQ(false, rg2_column1->is_stats_set()); - ASSERT_EQ(NULLPTR, rg2_column1->statistics()); - // ASSERT_THROW(rg2_column1->encodings(), ParquetException); - - ASSERT_EQ(nrows / 2, rg2_column1->num_values()); - ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column1->compression()); - ASSERT_EQ(3, rg2_column1->encodings().size()); - ASSERT_EQ(512, rg2_column1->total_compressed_size()); - ASSERT_EQ(600, rg2_column1->total_uncompressed_size()); - ASSERT_EQ(6, rg2_column1->dictionary_page_offset()); - ASSERT_EQ(10, rg2_column1->data_page_offset()); - - ASSERT_EQ(true, rg2_column2->is_stats_set()); - ASSERT_EQ(stats_float.min(), rg2_column2->statistics()->EncodeMin()); - ASSERT_EQ(stats_float.max(), rg2_column2->statistics()->EncodeMax()); - ASSERT_EQ(0, rg2_column2->statistics()->null_count()); - ASSERT_EQ(nrows, rg2_column2->statistics()->distinct_count()); - ASSERT_EQ(nrows / 2, rg2_column2->num_values()); - ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column2->compression()); - ASSERT_EQ(3, rg2_column2->encodings().size()); - ASSERT_EQ(512, rg2_column2->total_compressed_size()); - ASSERT_EQ(600, rg2_column2->total_uncompressed_size()); - ASSERT_EQ(16, rg2_column2->dictionary_page_offset()); - ASSERT_EQ(26, rg2_column2->data_page_offset()); -} - -} // namespace metadata -} // namespace parquet From d6f30e11f034985b72b02aaada73f2d89e8d7bbc Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Mon, 17 Jun 2019 15:53:08 +0700 Subject: [PATCH 107/201] fix windows compiling issue --- cpp/src/parquet/encryption.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index f3994ecf822b..2ffe36ec73fc 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -85,7 +85,7 @@ class PARQUET_EXPORT UnsupportedOperationException : public ParquetException { class PARQUET_EXPORT ColumnEncryptionProperties { public: - class Builder { + class PARQUET_EXPORT Builder 
{ public: /// Convenience builder for regular (not nested) columns. explicit Builder(const std::string& name) { @@ -172,7 +172,7 @@ class PARQUET_EXPORT ColumnEncryptionProperties { class PARQUET_EXPORT ColumnDecryptionProperties { public: - class Builder { + class PARQUET_EXPORT Builder { public: /// convenience builder for regular (not nested) columns. explicit Builder(const std::string& name) @@ -233,7 +233,7 @@ class PARQUET_EXPORT AADPrefixVerifier { class PARQUET_EXPORT FileDecryptionProperties { public: - class Builder { + class PARQUET_EXPORT Builder { public: Builder() { check_plaintext_footer_integrity_ = kDefaultCheckSignature; @@ -384,7 +384,7 @@ class PARQUET_EXPORT FileDecryptionProperties { class PARQUET_EXPORT FileEncryptionProperties { public: - class Builder { + class PARQUET_EXPORT Builder { public: explicit Builder(const std::string& footer_key) : parquet_cipher_(kDefaultEncryptionAlgorithm), From 512fd1fb6c044628818fc402392472daf1f2541a Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Mon, 17 Jun 2019 23:23:29 +0700 Subject: [PATCH 108/201] fix issue of parquet-encryption-example --- cpp/src/parquet/column_writer.cc | 10 ++--- cpp/src/parquet/metadata.cc | 75 ++++++++++++-------------------- cpp/src/parquet/metadata.h | 6 +-- 3 files changed, 37 insertions(+), 54 deletions(-) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 9c82db5a046a..01208dad7bdb 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -235,17 +235,17 @@ class SerializedPageWriter : public PageWriter { } void Close(bool has_dictionary, bool fallback) override { - // index_page_offset = -1 since they are not supported - metadata_->Finish(num_values_, dictionary_page_offset_, -1, data_page_offset_, - total_compressed_size_, total_uncompressed_size_, has_dictionary, - fallback); if (meta_encryptor_ != nullptr) { meta_encryptor_->update_aad(encryption::CreateModuleAad( meta_encryptor_->file_aad(), 
encryption::kColumnMetaData, row_group_ordinal_, column_ordinal_, (int16_t)-1)); } + // index_page_offset = -1 since they are not supported + metadata_->Finish(num_values_, dictionary_page_offset_, -1, data_page_offset_, + total_compressed_size_, total_uncompressed_size_, has_dictionary, + fallback, meta_encryptor_); // Write metadata at end of column chunk - metadata_->WriteTo(sink_.get(), meta_encryptor_); + metadata_->WriteTo(sink_.get()); } /** diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index b0a82d00a667..5be027214224 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -181,10 +181,10 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { InternalFileDecryptor* file_decryptor = NULLPTR) : column_(column), descr_(descr), writer_version_(writer_version) { if (column->__isset.crypto_metadata) { // column metadata is encrypted - is_metadata_set_ = false; format::ColumnCryptoMetaData ccmd = column->crypto_metadata; if (ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY) { + is_metadata_set_ = false; if (file_decryptor != NULLPTR && file_decryptor->properties() != NULLPTR) { // should decrypt metadata std::shared_ptr path = std::make_shared( @@ -199,9 +199,11 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { uint32_t len = static_cast(column->encrypted_column_metadata.size()); DeserializeThriftMsg( reinterpret_cast(column->encrypted_column_metadata.c_str()), - &len, &metadata_, decryptor, false); + &len, &decrypted_metadata_, decryptor, false); is_metadata_set_ = true; } + } else { + is_metadata_set_ = true; } } else { // column metadata is not encrypted is_metadata_set_ = true; @@ -299,19 +301,20 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { mutable std::shared_ptr possible_stats_; std::vector encodings_; const format::ColumnChunk* column_; - format::ColumnMetaData metadata_; + format::ColumnMetaData decrypted_metadata_; const ColumnDescriptor* descr_; const ApplicationVersion* writer_version_; bool 
is_metadata_set_; inline const format::ColumnMetaData& GetMetadataIfSet() const { - if (column_->__isset.crypto_metadata) { + if (column_->__isset.crypto_metadata + && column_->crypto_metadata.__isset.ENCRYPTION_WITH_COLUMN_KEY) { if (!is_metadata_set_) { throw ParquetException( "Cannot decrypt ColumnMetadata. " "FileDecryptionProperties must be provided."); } else { - return metadata_; + return decrypted_metadata_; } } else { return column_->meta_data; @@ -920,7 +923,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { void Finish(int64_t num_values, int64_t dictionary_page_offset, int64_t index_page_offset, int64_t data_page_offset, int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, - bool dictionary_fallback) { + bool dictionary_fallback, const std::shared_ptr& encryptor) { if (dictionary_page_offset > 0) { column_chunk_->meta_data.__set_dictionary_page_offset(dictionary_page_offset); column_chunk_->__set_file_offset(dictionary_page_offset + compressed_size); @@ -954,37 +957,11 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { thrift_encodings.push_back(ToThrift(Encoding::PLAIN)); } column_chunk_->meta_data.__set_encodings(thrift_encodings); - } - - void WriteTo(::arrow::io::OutputStream* sink, - const std::shared_ptr& encryptor) { - ThriftSerializer serializer; const auto& encrypt_md = properties_->column_encryption_properties(column_->path()); - // column is unencrypted - if (encrypt_md == NULLPTR || !encrypt_md->is_encrypted()) { - serializer.Serialize(column_chunk_, sink); - } else { // column is encrypted - // copy column chunk, except for meta_data - format::ColumnChunk column_chunk; - column_chunk.__set_file_offset(column_chunk_->file_offset); - if (column_chunk_->__isset.file_path) { - column_chunk.__set_file_path(column_chunk_->file_path); - } - if (column_chunk_->__isset.offset_index_offset) { - column_chunk.__set_offset_index_offset(column_chunk_->offset_index_offset); - } - if 
(column_chunk_->__isset.offset_index_length) { - column_chunk.__set_offset_index_length(column_chunk_->offset_index_length); - } - if (column_chunk_->__isset.column_index_offset) { - column_chunk.__set_column_index_offset(column_chunk_->column_index_offset); - } - if (column_chunk_->__isset.column_index_length) { - column_chunk.__set_column_index_length(column_chunk_->column_index_length); - } - - column_chunk.__isset.crypto_metadata = true; + // column is encrypted + if (encrypt_md != NULLPTR && encrypt_md->is_encrypted()) { + column_chunk_->__isset.crypto_metadata = true; format::ColumnCryptoMetaData ccmd; if (encrypt_md->is_encrypted_with_footer_key()) { // encrypted with footer key @@ -997,13 +974,14 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY = true; ccmd.__set_ENCRYPTION_WITH_COLUMN_KEY(eck); } - column_chunk.__set_crypto_metadata(ccmd); + column_chunk_->__set_crypto_metadata(ccmd); bool encrypted_footer = properties_->file_encryption_properties()->encrypted_footer(); bool encrypt_metadata = !encrypted_footer || !encrypt_md->is_encrypted_with_footer_key(); if (encrypt_metadata) { + ThriftSerializer serializer; // Serialize and encrypt ColumnMetadata separately // Thrift-serialize the ColumnMetaData structure, // encrypt it with the column key, and write to encrypted_column_metadata @@ -1021,10 +999,10 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { const char* temp = const_cast(reinterpret_cast(encrypted_data.data())); std::string encrypted_column_metadata(temp, encrypted_len); - column_chunk.__set_encrypted_column_metadata(encrypted_column_metadata); + column_chunk_->__set_encrypted_column_metadata(encrypted_column_metadata); if (encrypted_footer) { - column_chunk.__isset.meta_data = false; + column_chunk_->__isset.meta_data = false; } else { // Keep redacted metadata version for old readers format::ColumnMetaData metadata_redacted; @@ -1057,14 +1035,18 @@ class 
ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { metadata_redacted.__isset.statistics = false; metadata_redacted.__isset.encoding_stats = false; - column_chunk.__isset.meta_data = true; - column_chunk.__set_meta_data(metadata_redacted); + column_chunk_->__isset.meta_data = true; + column_chunk_->__set_meta_data(metadata_redacted); } } - serializer.Serialize(&column_chunk, sink); } } + void WriteTo(::arrow::io::OutputStream* sink) { + ThriftSerializer serializer; + serializer.Serialize(column_chunk_, sink); + } + const ColumnDescriptor* descr() const { return column_; } int64_t total_compressed_size() const { return column_chunk_->meta_data.total_compressed_size; @@ -1124,14 +1106,15 @@ void ColumnChunkMetaDataBuilder::Finish(int64_t num_values, int64_t index_page_offset, int64_t data_page_offset, int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, - bool dictionary_fallback) { + bool dictionary_fallback, + const std::shared_ptr& encryptor) { impl_->Finish(num_values, dictionary_page_offset, index_page_offset, data_page_offset, - compressed_size, uncompressed_size, has_dictionary, dictionary_fallback); + compressed_size, uncompressed_size, has_dictionary, dictionary_fallback, + encryptor); } -void ColumnChunkMetaDataBuilder::WriteTo(::arrow::io::OutputStream* sink, - const std::shared_ptr& encryptor) { - impl_->WriteTo(sink, encryptor); +void ColumnChunkMetaDataBuilder::WriteTo(::arrow::io::OutputStream* sink) { + impl_->WriteTo(sink); } const ColumnDescriptor* ColumnChunkMetaDataBuilder::descr() const { diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index aaeaa322b5bb..89ea24527c54 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -296,14 +296,14 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { void Finish(int64_t num_values, int64_t dictonary_page_offset, int64_t index_page_offset, int64_t data_page_offset, int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, - 
bool dictionary_fallback); + bool dictionary_fallback, + const std::shared_ptr& encryptor = NULLPTR); // The metadata contents, suitable for passing to ColumnChunkMetaData::Make const void* contents() const; // For writing metadata at end of column chunk - void WriteTo(::arrow::io::OutputStream* sink, - const std::shared_ptr& encryptor = NULLPTR); + void WriteTo(::arrow::io::OutputStream* sink); private: explicit ColumnChunkMetaDataBuilder(const std::shared_ptr& props, From e73893286875e1f4b6667154dade2dc833755ea9 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Tue, 18 Jun 2019 21:06:31 +0700 Subject: [PATCH 109/201] rename encryption-test.cc to encryption-properties-test.cc --- cpp/src/parquet/CMakeLists.txt | 4 +++- .../{encryption-test.cc => encryption-properties-test.cc} | 0 2 files changed, 3 insertions(+), 1 deletion(-) rename cpp/src/parquet/{encryption-test.cc => encryption-properties-test.cc} (100%) diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 77de75553f0c..0716dc5a79e9 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -322,7 +322,9 @@ add_parquet_test(arrow-test test_util.cc) if(PARQUET_BUILD_ENCRYPTION) - add_parquet_test(encryption-test) + add_parquet_test(encryption-test + SOURCES + encryption-properties-test.cc) endif() # Those tests need to use static linking as they access thrift-generated diff --git a/cpp/src/parquet/encryption-test.cc b/cpp/src/parquet/encryption-properties-test.cc similarity index 100% rename from cpp/src/parquet/encryption-test.cc rename to cpp/src/parquet/encryption-properties-test.cc From 85c08a5e0d0507ce0ca8c3be3710ef1218f637af Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Tue, 18 Jun 2019 23:07:41 +0700 Subject: [PATCH 110/201] use isset instead of creating a copy of column chunk metadata --- cpp/src/parquet/metadata.cc | 33 ++------------------------------- 1 file changed, 2 insertions(+), 31 deletions(-) diff --git a/cpp/src/parquet/metadata.cc 
b/cpp/src/parquet/metadata.cc index 5be027214224..e386eb91def9 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -1005,38 +1005,9 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { column_chunk_->__isset.meta_data = false; } else { // Keep redacted metadata version for old readers - format::ColumnMetaData metadata_redacted; - metadata_redacted.__set_type(column_chunk_->meta_data.type); - metadata_redacted.__set_encodings(column_chunk_->meta_data.encodings); - metadata_redacted.__set_path_in_schema(column_chunk_->meta_data.path_in_schema); - metadata_redacted.__set_codec(column_chunk_->meta_data.codec); - metadata_redacted.__set_num_values(column_chunk_->meta_data.num_values); - metadata_redacted.__set_total_uncompressed_size( - column_chunk_->meta_data.total_uncompressed_size); - metadata_redacted.__set_total_compressed_size( - column_chunk_->meta_data.total_compressed_size); - if (column_chunk_->meta_data.__isset.key_value_metadata) { - metadata_redacted.__isset.key_value_metadata = true; - metadata_redacted.__set_key_value_metadata( - column_chunk_->meta_data.key_value_metadata); - } - metadata_redacted.__set_data_page_offset( - column_chunk_->meta_data.data_page_offset); - if (column_chunk_->meta_data.__isset.index_page_offset) { - metadata_redacted.__isset.index_page_offset = true; - metadata_redacted.__set_index_page_offset( - column_chunk_->meta_data.index_page_offset); - } - if (column_chunk_->meta_data.__isset.dictionary_page_offset) { - metadata_redacted.__isset.dictionary_page_offset = true; - metadata_redacted.__set_dictionary_page_offset( - column_chunk_->meta_data.dictionary_page_offset); - } - metadata_redacted.__isset.statistics = false; - metadata_redacted.__isset.encoding_stats = false; - column_chunk_->__isset.meta_data = true; - column_chunk_->__set_meta_data(metadata_redacted); + column_chunk_->meta_data.__isset.statistics = false; + column_chunk_->meta_data.__isset.encoding_stats = false; } } } 
From 4419abc8b934a7b1d37281bebab906d3a790d3a5 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Fri, 21 Jun 2019 06:26:28 +0300 Subject: [PATCH 111/201] Address review comments --- ...yption-reader-writer-all-crypto-options.cc | 31 +- cpp/src/parquet/CMakeLists.txt | 3 +- cpp/src/parquet/column_reader.cc | 128 +-- cpp/src/parquet/column_reader.h | 12 +- cpp/src/parquet/column_writer.cc | 80 +- .../parquet/encryption-configurations-test.cc | 767 ++++++++++++++++++ cpp/src/parquet/encryption.cc | 84 +- cpp/src/parquet/encryption.h | 126 +-- cpp/src/parquet/encryption_internal.cc | 9 +- cpp/src/parquet/encryption_internal.h | 4 +- cpp/src/parquet/file_reader.cc | 333 ++++---- cpp/src/parquet/file_writer.cc | 132 +-- cpp/src/parquet/file_writer.h | 20 +- cpp/src/parquet/internal_file_decryptor.cc | 112 +-- cpp/src/parquet/internal_file_decryptor.h | 28 +- cpp/src/parquet/internal_file_encryptor.cc | 115 +-- cpp/src/parquet/internal_file_encryptor.h | 30 +- cpp/src/parquet/metadata.cc | 6 +- cpp/src/parquet/parquet.pc | 30 - cpp/src/parquet/parquet.thrift | 4 +- cpp/src/parquet/parquet_version.h | 24 - cpp/src/parquet/thrift.h | 109 ++- 22 files changed, 1427 insertions(+), 760 deletions(-) create mode 100644 cpp/src/parquet/encryption-configurations-test.cc delete mode 100644 cpp/src/parquet/parquet.pc delete mode 100644 cpp/src/parquet/parquet_version.h diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc index 5241e559caea..069f997d1ba5 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc @@ -27,7 +27,10 @@ /* * This file contains samples for writing and reading encrypted Parquet files in different - * encryption and decryption configurations. 
The samples have the following goals: + * encryption and decryption configurations. + * Each sample section is dedicated to an independent configuration and shows its creation + * from beginning to end. + * The samples have the following goals: * 1) Demonstrate usage of different options for data encryption and decryption. * 2) Produce encrypted files for interoperability tests with other (eg parquet-mr) * readers that support encryption. @@ -36,7 +39,7 @@ * 4) Perform interoperability tests with other (eg parquet-mr) writers, by reading * encrypted files produced by these writers. * - * The write sample produces number of parquet files, each encrypted with a different + * Each write sample produces new independent parquet file, encrypted with a different * encryption configuration as described below. * The name of each file is in the form of: * tester.parquet.encrypted. @@ -113,7 +116,7 @@ std::vector GetDirectoryFiles(const std::string& path) { return files; } -void InteropTestWriteEncryptedParquetFiles(std::string rootPath) { +void InteropTestWriteEncryptedParquetFiles(std::string root_path) { /********************************************************************************** Creating a number of Encryption configurations **********************************************************************************/ @@ -260,7 +263,7 @@ void InteropTestWriteEncryptedParquetFiles(std::string rootPath) { using FileClass = ::arrow::io::FileOutputStream; std::shared_ptr out_file; std::string file = - rootPath + fileName + std::string(test_number_string) + ".parquet.encrypted"; + root_path + fileName + std::string(test_number_string) + ".parquet.encrypted"; std::cout << "Write " << file << std::endl; PARQUET_THROW_NOT_OK(FileClass::Open(file, &out_file)); @@ -326,8 +329,8 @@ void InteropTestWriteEncryptedParquetFiles(std::string rootPath) { } } -void InteropTestReadEncryptedParquetFiles(std::string rootPath) { - std::vector files_in_directory = GetDirectoryFiles(rootPath); +void 
InteropTestReadEncryptedParquetFiles(std::string root_path) { + std::vector files_in_directory = GetDirectoryFiles(root_path); /********************************************************************************** Creating a number of Decryption configurations @@ -414,7 +417,7 @@ void InteropTestReadEncryptedParquetFiles(std::string rootPath) { // Create a ParquetReader instance std::unique_ptr parquet_reader = - parquet::ParquetFileReader::OpenFile(rootPath + file, false, + parquet::ParquetFileReader::OpenFile(root_path + file, false, reader_properties); // Get the File MetaData @@ -611,7 +614,7 @@ bool FileNameEndsWith(std::string file_name, std::string suffix) { int main(int argc, char** argv) { enum Operation { write, read }; - std::string rootPath; + std::string root_path; Operation operation = write; if (argc < 3) { std::cout << "Usage: encryption-reader-writer-all-crypto-options " @@ -619,18 +622,18 @@ int main(int argc, char** argv) { << std::endl; exit(1); } - rootPath = argv[1]; - if (rootPath.compare("read") == 0) { + root_path = argv[1]; + if (root_path.compare("read") == 0) { operation = read; } - rootPath = argv[2]; - std::cout << "Root path is: " << rootPath << std::endl; + root_path = argv[2]; + std::cout << "Root path is: " << root_path << std::endl; if (operation == write) { - InteropTestWriteEncryptedParquetFiles(rootPath); + InteropTestWriteEncryptedParquetFiles(root_path); } else - InteropTestReadEncryptedParquetFiles(rootPath); + InteropTestReadEncryptedParquetFiles(root_path); return 0; } diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 0716dc5a79e9..c715b0c4684d 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -324,7 +324,8 @@ add_parquet_test(arrow-test if(PARQUET_BUILD_ENCRYPTION) add_parquet_test(encryption-test SOURCES - encryption-properties-test.cc) + encryption-properties-test.cc + encryption-configurations-test.cc) endif() # Those tests need to use static linking as 
they access thrift-generated diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 120f9d929e51..04b48f0af1f8 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -119,36 +119,25 @@ class SerializedPageReader : public PageReader { public: SerializedPageReader(const std::shared_ptr& stream, int64_t total_num_rows, Compression::type codec, - bool column_has_dictionary, int16_t row_group_ordinal, - int16_t column_ordinal, ::arrow::MemoryPool* pool, - std::shared_ptr meta_decryptor, - std::shared_ptr data_decryptor) + ::arrow::MemoryPool* pool, struct PageReaderContext* ctx) : stream_(stream), decompression_buffer_(AllocateBuffer(pool, 0)), first_page_(true), - column_has_dictionary_(column_has_dictionary), - row_group_ordinal_(row_group_ordinal), - column_ordinal_(column_ordinal), page_ordinal_(-1), seen_num_rows_(0), total_num_rows_(total_num_rows), - decryption_buffer_(AllocateBuffer(pool, 0)), - meta_decryptor_(meta_decryptor), - data_decryptor_(data_decryptor) { + decryption_buffer_(AllocateBuffer(pool, 0)) { + if (ctx != NULLPTR) { + column_has_dictionary_ = ctx->column_has_dictionary; + row_group_ordinal_ = ctx->row_group_ordinal; + column_ordinal_ = ctx->column_ordinal; + meta_decryptor_ = ctx->meta_decryptor; + data_decryptor_ = ctx->data_decryptor; + } max_page_header_size_ = kDefaultMaxPageHeaderSize; decompressor_ = GetCodecFromArrow(codec); - if (data_decryptor_ != NULLPTR) { - DCHECK(!data_decryptor_->file_aad().empty()); - // prepare the AAD for quick update later - data_pageAAD_ = - encryption::CreateModuleAad(data_decryptor_->file_aad(), encryption::kDataPage, - row_group_ordinal_, column_ordinal_, (int16_t)-1); - } - if (meta_decryptor_ != NULLPTR) { - DCHECK(!meta_decryptor_->file_aad().empty()); - data_page_headerAAD_ = encryption::CreateModuleAad( - meta_decryptor_->file_aad(), encryption::kDataPageHeader, row_group_ordinal_, - column_ordinal_, (int16_t)-1); + if (data_decryptor_ 
!= NULLPTR || meta_decryptor_ != NULLPTR) { + InitDecryption(); } } @@ -158,6 +147,12 @@ class SerializedPageReader : public PageReader { void set_max_page_header_size(uint32_t size) override { max_page_header_size_ = size; } private: + void UpdateDecryption(const std::shared_ptr& decryptor, + bool current_page_is_dictionary, int8_t module_type, + const std::string& pageAAD); + + void InitDecryption(); + std::shared_ptr stream_; format::PageHeader current_page_header_; @@ -167,11 +162,31 @@ class SerializedPageReader : public PageReader { std::unique_ptr<::arrow::util::Codec> decompressor_; std::shared_ptr decompression_buffer_; - bool first_page_; + // The fields below are used for calculation of AAD (additional authenticated data) + // suffix which is part of the Parquet Modular Encryption. + // The AAD suffix for a parquet module is built internally by Parquet, by direct + // concatenation the different parts of the module, which includes amongst other + // its row group ordinal, column ordinal and page ordinal. + // Please refer to the encryption specification for more details: + // https://github.com/apache/parquet-format/blob/encryption/Encryption.md#44-additional-authenticated-data + + // To calculate the AAD suffix of an encrypted module, the exact type of the module + // should be known. The following two fields indicate whether the page is data or + // dictionary page. + + // Indicates whether the column has dictionary page. bool column_has_dictionary_; + // If the column has dictionary page and the page currently processed in the first + // one then it is a dictionary page. + bool first_page_; + // The ordinal fields below are used for AAD suffix calculation. int16_t row_group_ordinal_; int16_t column_ordinal_; int16_t page_ordinal_; + // data_pageAAD_ and data_page_headerAAD_ contain the AAD for data page and data page + // header in a single column respectively. 
+ // While calculating AAD for different pages in a single column the pages AAD is + // updated by only the page ordinal. std::string data_pageAAD_; std::string data_page_headerAAD_; @@ -190,6 +205,38 @@ class SerializedPageReader : public PageReader { std::shared_ptr data_decryptor_; }; +void SerializedPageReader::InitDecryption() { + // Prepare the AAD for quick update later. + if (data_decryptor_ != NULLPTR) { + DCHECK(!data_decryptor_->file_aad().empty()); + data_pageAAD_ = encryption::CreateModuleAad( + data_decryptor_->file_aad(), encryption::kDataPage, row_group_ordinal_, + column_ordinal_, static_cast(-1)); + } + if (meta_decryptor_ != NULLPTR) { + DCHECK(!meta_decryptor_->file_aad().empty()); + data_page_headerAAD_ = encryption::CreateModuleAad( + meta_decryptor_->file_aad(), encryption::kDataPageHeader, row_group_ordinal_, + column_ordinal_, static_cast(-1)); + } +} + +void SerializedPageReader::UpdateDecryption(const std::shared_ptr& decryptor, + bool current_page_is_dictionary, + int8_t module_type, + const std::string& pageAAD) { + DCHECK(decryptor != NULLPTR); + if (current_page_is_dictionary) { + std::string aad = encryption::CreateModuleAad(decryptor->file_aad(), module_type, + row_group_ordinal_, column_ordinal_, + static_cast(-1)); + decryptor->UpdateAad(aad); + } else { + encryption::QuickUpdatePageAad(pageAAD, page_ordinal_); + decryptor->UpdateAad(pageAAD); + } +} + std::shared_ptr SerializedPageReader::NextPage() { // Loop here because there may be unhandled page types that we skip until // finding a page that we do know what to do with @@ -223,16 +270,8 @@ std::shared_ptr SerializedPageReader::NextPage() { header_size = static_cast(buffer.size()); try { if (meta_decryptor_ != NULLPTR) { - if (current_page_is_dictionary) { - std::string dictionary_page_header_aad; - dictionary_page_header_aad = encryption::CreateModuleAad( - meta_decryptor_->file_aad(), encryption::kDictionaryPageHeader, - row_group_ordinal_, column_ordinal_, (int16_t)-1); - 
meta_decryptor_->update_aad(dictionary_page_header_aad); - } else { - encryption::QuickUpdatePageAad(data_page_headerAAD_, page_ordinal_); - meta_decryptor_->update_aad(data_page_headerAAD_); - } + UpdateDecryption(meta_decryptor_, current_page_is_dictionary, + encryption::kDictionaryPageHeader, data_page_headerAAD_); } DeserializeThriftMsg(reinterpret_cast(buffer.data()), &header_size, ¤t_page_header_, meta_decryptor_); @@ -254,17 +293,8 @@ std::shared_ptr SerializedPageReader::NextPage() { int compressed_len = current_page_header_.compressed_page_size; int uncompressed_len = current_page_header_.uncompressed_page_size; if (data_decryptor_ != NULLPTR) { - DCHECK(!data_decryptor_->file_aad().empty()); - if (current_page_is_dictionary) { - std::string dictionary_page_aad; - dictionary_page_aad = encryption::CreateModuleAad( - data_decryptor_->file_aad(), encryption::kDictionaryPage, row_group_ordinal_, - column_ordinal_, (int16_t)-1); - data_decryptor_->update_aad(dictionary_page_aad); - } else { - encryption::QuickUpdatePageAad(data_pageAAD_, page_ordinal_); - data_decryptor_->update_aad(data_pageAAD_); - } + UpdateDecryption(data_decryptor_, current_page_is_dictionary, + encryption::kDictionaryPage, data_pageAAD_); } // Read the compressed data page. 
@@ -355,13 +385,9 @@ std::shared_ptr SerializedPageReader::NextPage() { std::unique_ptr PageReader::Open( const std::shared_ptr& stream, int64_t total_num_rows, - Compression::type codec, ::arrow::MemoryPool* pool, bool column_has_dictionary, - int16_t row_group_ordinal, int16_t column_ordinal, - std::shared_ptr meta_decryptor, - std::shared_ptr data_decryptor) { - return std::unique_ptr(new SerializedPageReader( - stream, total_num_rows, codec, column_has_dictionary, row_group_ordinal, - column_ordinal, pool, meta_decryptor, data_decryptor)); + Compression::type codec, ::arrow::MemoryPool* pool, struct PageReaderContext* ctx) { + return std::unique_ptr( + new SerializedPageReader(stream, total_num_rows, codec, pool, ctx)); } // ---------------------------------------------------------------------- diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 563613f0ccb3..6a180ca9b7e4 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -73,6 +73,14 @@ class PARQUET_EXPORT LevelDecoder { std::unique_ptr<::arrow::BitUtil::BitReader> bit_packed_decoder_; }; +struct PageReaderContext { + bool column_has_dictionary; + int16_t row_group_ordinal; + int16_t column_ordinal; + std::shared_ptr meta_decryptor; + std::shared_ptr data_decryptor; +}; + // Abstract page iterator interface. 
This way, we can feed column pages to the // ColumnReader through whatever mechanism we choose class PARQUET_EXPORT PageReader { @@ -82,9 +90,7 @@ class PARQUET_EXPORT PageReader { static std::unique_ptr Open( const std::shared_ptr& stream, int64_t total_num_rows, Compression::type codec, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), - bool column_has_dictionary = false, int16_t row_group_ordinal = -1, - int16_t column_ordinal = -1, std::shared_ptr meta_decryptor = NULLPTR, - std::shared_ptr data_decryptor = NULLPTR); + struct PageReaderContext* ctx = NULLPTR); // @returns: shared_ptr(nullptr) on EOS, std::shared_ptr // containing new Page otherwise diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 01208dad7bdb..1ae33c624999 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -157,16 +157,8 @@ class SerializedPageWriter : public PageWriter { column_ordinal_(column_chunk_ordinal), meta_encryptor_(meta_encryptor), data_encryptor_(data_encryptor) { - if (data_encryptor_ != NULLPTR) { - // prepare the add for quick update later - data_pageAAD_ = - encryption::CreateModuleAad(data_encryptor_->file_aad(), encryption::kDataPage, - row_group_ordinal_, column_ordinal_, (int16_t)-1); - } - if (meta_encryptor_ != NULLPTR) { - data_page_headerAAD_ = encryption::CreateModuleAad( - meta_encryptor_->file_aad(), encryption::kDataPageHeader, row_group_ordinal_, - column_ordinal_, (int16_t)-1); + if (data_encryptor_ != NULLPTR || meta_encryptor_ != NULLPTR) { + InitEncryption(); } compressor_ = GetCodecFromArrow(codec); thrift_serializer_.reset(new ThriftSerializer); @@ -194,9 +186,7 @@ class SerializedPageWriter : public PageWriter { std::shared_ptr encrypted_data_buffer = nullptr; if (data_encryptor_.get()) { - data_encryptor_->update_aad(encryption::CreateModuleAad( - data_encryptor_->file_aad(), encryption::kDictionaryPage, row_group_ordinal_, - column_ordinal_, (int16_t)-1)); + 
UpdateEncryption(encryption::kDictionaryPage); encrypted_data_buffer = std::static_pointer_cast(AllocateBuffer( pool_, data_encryptor_->CiphertextSizeDelta() + output_data_len)); output_data_len = data_encryptor_->Encrypt(compressed_data->data(), output_data_len, @@ -218,9 +208,7 @@ class SerializedPageWriter : public PageWriter { } if (meta_encryptor_) { - meta_encryptor_->update_aad(encryption::CreateModuleAad( - meta_encryptor_->file_aad(), encryption::kDictionaryPageHeader, - row_group_ordinal_, column_ordinal_, (int16_t)-1)); + UpdateEncryption(encryption::kDictionaryPageHeader); } int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_); @@ -236,10 +224,9 @@ class SerializedPageWriter : public PageWriter { void Close(bool has_dictionary, bool fallback) override { if (meta_encryptor_ != nullptr) { - meta_encryptor_->update_aad(encryption::CreateModuleAad( - meta_encryptor_->file_aad(), encryption::kColumnMetaData, row_group_ordinal_, - column_ordinal_, (int16_t)-1)); + UpdateEncryption(encryption::kColumnMetaData); } + // index_page_offset = -1 since they are not supported metadata_->Finish(num_values_, dictionary_page_offset_, -1, data_page_offset_, total_compressed_size_, total_uncompressed_size_, has_dictionary, @@ -286,8 +273,7 @@ class SerializedPageWriter : public PageWriter { std::shared_ptr encrypted_data_buffer = AllocateBuffer(pool_, 0); if (data_encryptor_.get()) { - encryption::QuickUpdatePageAad(data_pageAAD_, page_ordinal_); - data_encryptor_->update_aad(data_pageAAD_); + UpdateEncryption(encryption::kDataPage); PARQUET_THROW_NOT_OK(encrypted_data_buffer->Resize( data_encryptor_->CiphertextSizeDelta() + output_data_len)); output_data_len = data_encryptor_->Encrypt(compressed_data->data(), output_data_len, @@ -309,8 +295,7 @@ class SerializedPageWriter : public PageWriter { } if (meta_encryptor_) { - encryption::QuickUpdatePageAad(data_page_headerAAD_, page_ordinal_); - 
meta_encryptor_->update_aad(data_page_headerAAD_); + UpdateEncryption(encryption::kDataPageHeader); } int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_); @@ -339,6 +324,55 @@ class SerializedPageWriter : public PageWriter { int64_t total_uncompressed_size() { return total_uncompressed_size_; } private: + void InitEncryption() { + // Prepare the AAD for quick update later. + if (data_encryptor_ != NULLPTR) { + data_pageAAD_ = encryption::CreateModuleAad( + data_encryptor_->file_aad(), encryption::kDataPage, row_group_ordinal_, + column_ordinal_, static_cast(-1)); + } + if (meta_encryptor_ != NULLPTR) { + data_page_headerAAD_ = encryption::CreateModuleAad( + meta_encryptor_->file_aad(), encryption::kDataPageHeader, row_group_ordinal_, + column_ordinal_, static_cast(-1)); + } + } + + void UpdateEncryption(int8_t module_type) { + switch (module_type) { + case encryption::kColumnMetaData: { + meta_encryptor_->UpdateAad(encryption::CreateModuleAad( + meta_encryptor_->file_aad(), module_type, row_group_ordinal_, column_ordinal_, + static_cast(-1))); + break; + } + case encryption::kDataPage: { + encryption::QuickUpdatePageAad(data_pageAAD_, page_ordinal_); + data_encryptor_->UpdateAad(data_pageAAD_); + break; + } + case encryption::kDataPageHeader: { + encryption::QuickUpdatePageAad(data_page_headerAAD_, page_ordinal_); + meta_encryptor_->UpdateAad(data_page_headerAAD_); + break; + } + case encryption::kDictionaryPageHeader: { + meta_encryptor_->UpdateAad(encryption::CreateModuleAad( + meta_encryptor_->file_aad(), module_type, row_group_ordinal_, column_ordinal_, + static_cast(-1))); + break; + } + case encryption::kDictionaryPage: { + data_encryptor_->UpdateAad(encryption::CreateModuleAad( + data_encryptor_->file_aad(), module_type, row_group_ordinal_, column_ordinal_, + static_cast(-1))); + break; + } + default: + throw ParquetException("Unknown module type in UpdateEncryption"); + } + } + std::shared_ptr sink_; 
ColumnChunkMetaDataBuilder* metadata_; MemoryPool* pool_; diff --git a/cpp/src/parquet/encryption-configurations-test.cc b/cpp/src/parquet/encryption-configurations-test.cc new file mode 100644 index 000000000000..2afc20e52234 --- /dev/null +++ b/cpp/src/parquet/encryption-configurations-test.cc @@ -0,0 +1,767 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "parquet/column_reader.h" +#include "parquet/column_writer.h" +#include "parquet/file_reader.h" +#include "parquet/file_writer.h" +#include "parquet/platform.h" +#include "parquet/test-util.h" + +/* + * This file contains unit-tests for writing and reading encrypted Parquet files with + * different encryption and decryption configurations. + * + * Each unit-test produces a single parquet file, encrypted with one of the encryption + * configuration described below; and is read multiple times using a set of decryption + * configurations, also described below. 
+ * + * A detailed description of the Parquet Modular Encryption specification can be found + * here: + * https://github.com/apache/parquet-format/blob/encryption/Encryption.md + * + * Each unit-test creates a single parquet file with eight columns using one of the + * following encryption configurations: + * + * - Encryption configuration 1: Encrypt all columns and the footer with the same key. + * (uniform encryption) + * - Encryption configuration 2: Encrypt two columns and the footer, with different + * keys. + * - Encryption configuration 3: Encrypt two columns, with different keys. + * Don’t encrypt footer (to enable legacy readers) + * - plaintext footer mode. + * - Encryption configuration 4: Encrypt two columns and the footer, with different + * keys. Supply aad_prefix for file identity + * verification. + * - Encryption configuration 5: Encrypt two columns and the footer, with different + * keys. Supply aad_prefix, and call + * disable_aad_prefix_storage to prevent file + * identity storage in file metadata. + * - Encryption configuration 6: Encrypt two columns and the footer, with different + * keys. Use the alternative (AES_GCM_CTR_V1) algorithm. + * + * The written parquet file produced above is read by each of the following decryption + * configurations: + * + * - Decryption configuration 1: Decrypt using key retriever that holds the keys of + * two encrypted columns and the footer key. + * - Decryption configuration 2: Decrypt using key retriever that holds the keys of + * two encrypted columns and the footer key. Supplies + * aad_prefix to verify file identity. + * - Decryption configuration 3: Decrypt using explicit column and footer keys + * (instead of key retrieval callback). 
+ */ + +namespace parquet { + +using schema::GroupNode; +using schema::NodePtr; +using schema::PrimitiveNode; +using parquet::LogicalType; +using parquet::Repetition; +using parquet::Type; + +constexpr int kFixedLength = 10; + +namespace test { + +const char kFooterEncryptionKey[] = "0123456789012345"; // 128bit/16 +const char kColumnEncryptionKey1[] = "1234567890123450"; +const char kColumnEncryptionKey2[] = "1234567890123451"; +const char kFileName[] = "tester"; + +class TestEncryptionConfiguration : public ::testing::Test { + public: + void SetUp() { + rows_per_rowgroup_ = 50; + // Setup the parquet schema + schema_ = SetupEncryptionSchema(); + createDecryptionConfigurations(); + path_to_double_field_ = parquet::schema::ColumnPath::FromDotString("double_field"); + path_to_float_field_ = parquet::schema::ColumnPath::FromDotString("float_field"); + } + + protected: + std::shared_ptr path_to_double_field_; + std::shared_ptr path_to_float_field_; + int rows_per_rowgroup_; + std::shared_ptr schema_; + // This vector will hold various decryption configurations. + std::vector> + vector_of_decryption_configurations_; + std::string kFooterEncryptionKey_ = std::string(kFooterEncryptionKey); + std::string kColumnEncryptionKey1_ = std::string(kColumnEncryptionKey1); + std::string kColumnEncryptionKey2_ = std::string(kColumnEncryptionKey2); + std::string kFileName_ = std::string(kFileName); + + void createDecryptionConfigurations() { + /********************************************************************************** + Creating a number of Decryption configurations + **********************************************************************************/ + + // Decryption configuration 1: Decrypt using key retriever callback that holds the + // keys of two encrypted columns and the footer key. 
+ std::shared_ptr string_kr1 = + std::make_shared(); + string_kr1->PutKey("kf", kFooterEncryptionKey_); + string_kr1->PutKey("kc1", kColumnEncryptionKey1_); + string_kr1->PutKey("kc2", kColumnEncryptionKey2_); + std::shared_ptr kr1 = + std::static_pointer_cast(string_kr1); + + parquet::FileDecryptionProperties::Builder file_decryption_builder_1; + vector_of_decryption_configurations_.push_back( + file_decryption_builder_1.key_retriever(kr1)->build()); + + // Decryption configuration 2: Decrypt using key retriever callback that holds the + // keys of two encrypted columns and the footer key. Supply aad_prefix. + std::shared_ptr string_kr2 = + std::make_shared(); + string_kr2->PutKey("kf", kFooterEncryptionKey_); + string_kr2->PutKey("kc1", kColumnEncryptionKey1_); + string_kr2->PutKey("kc2", kColumnEncryptionKey2_); + std::shared_ptr kr2 = + std::static_pointer_cast(string_kr2); + + parquet::FileDecryptionProperties::Builder file_decryption_builder_2; + vector_of_decryption_configurations_.push_back( + file_decryption_builder_2.key_retriever(kr2)->aad_prefix(kFileName_)->build()); + + // Decryption configuration 3: Decrypt using explicit column and footer keys. Supply + // aad_prefix. 
+ std::shared_ptr path_float_ptr = + parquet::schema::ColumnPath::FromDotString("float_field"); + std::shared_ptr path_double_ptr = + parquet::schema::ColumnPath::FromDotString("double_field"); + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + decryption_cols; + parquet::ColumnDecryptionProperties::Builder decryption_col_builder31( + path_double_ptr); + parquet::ColumnDecryptionProperties::Builder decryption_col_builder32(path_float_ptr); + + decryption_cols[path_double_ptr] = + decryption_col_builder31.key(kColumnEncryptionKey1_)->build(); + decryption_cols[path_float_ptr] = + decryption_col_builder32.key(kColumnEncryptionKey2_)->build(); + + parquet::FileDecryptionProperties::Builder file_decryption_builder_3; + vector_of_decryption_configurations_.push_back( + file_decryption_builder_3.footer_key(kFooterEncryptionKey_) + ->column_properties(decryption_cols) + ->build()); + } + + std::shared_ptr EncryptFile( + std::shared_ptr encryption_configurations) { + auto sink = CreateOutputStream(); + + WriterProperties::Builder prop_builder; + + prop_builder.compression(parquet::Compression::SNAPPY); + prop_builder.encryption(encryption_configurations); + std::shared_ptr writer_properties = prop_builder.build(); + + auto file_writer = ParquetFileWriter::Open(sink, schema_, writer_properties); + RowGroupWriter* row_group_writer; + row_group_writer = file_writer->AppendRowGroup(); + + // Write the Bool column + parquet::BoolWriter* bool_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + bool value = ((i % 2) == 0) ? true : false; + bool_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Int32 column + parquet::Int32Writer* int32_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + int32_t value = i; + int32_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Int64 column. Each row has repeats twice. 
+ parquet::Int64Writer* int64_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < 2 * rows_per_rowgroup_; i++) { + int64_t value = i * 1000 * 1000; + value *= 1000 * 1000; + int16_t definition_level = 1; + int16_t repetition_level = 0; + if ((i % 2) == 0) { + repetition_level = 1; // start of a new record + } + int64_writer->WriteBatch(1, &definition_level, &repetition_level, &value); + } + + // Write the INT96 column. + parquet::Int96Writer* int96_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + parquet::Int96 value; + value.value[0] = i; + value.value[1] = i + 1; + value.value[2] = i + 2; + int96_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Float column + parquet::FloatWriter* float_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + float value = static_cast(i) * 1.1f; + float_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Double column + parquet::DoubleWriter* double_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + double value = i * 1.1111111; + double_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the ByteArray column. 
Make every alternate values NULL + parquet::ByteArrayWriter* ba_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + parquet::ByteArray value; + char hello[kFixedLength] = "parquet"; + hello[7] = static_cast(static_cast('0') + i / 100); + hello[8] = static_cast(static_cast('0') + (i / 10) % 10); + hello[9] = static_cast(static_cast('0') + i % 10); + if (i % 2 == 0) { + int16_t definition_level = 1; + value.ptr = reinterpret_cast(&hello[0]); + value.len = kFixedLength; + ba_writer->WriteBatch(1, &definition_level, nullptr, &value); + } else { + int16_t definition_level = 0; + ba_writer->WriteBatch(1, &definition_level, nullptr, nullptr); + } + } + // Write the FixedLengthByteArray column + parquet::FixedLenByteArrayWriter* flba_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + parquet::FixedLenByteArray value; + char v = static_cast(i); + char flba[kFixedLength] = {v, v, v, v, v, v, v, v, v, v}; + value.ptr = reinterpret_cast(&flba[0]); + + flba_writer->WriteBatch(1, nullptr, nullptr, &value); + } + // Close the ParquetFileWriter + file_writer->Close(); + + std::shared_ptr buffer; + PARQUET_THROW_NOT_OK(sink->Finish(&buffer)); + return buffer; + } + + void DecryptFile(std::shared_ptr buffer, int example_id, + int encryption_configuration) { + std::string exception_msg; + try { + parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); + reader_properties.file_decryption_properties( + vector_of_decryption_configurations_[example_id]->DeepClone()); + + auto source = std::make_shared<::arrow::io::BufferReader>(buffer); + auto file_reader = ParquetFileReader::Open(source, reader_properties); + + // Get the File MetaData + std::shared_ptr file_metadata = file_reader->metadata(); + + // Get the number of RowGroups + int num_row_groups = file_metadata->num_row_groups(); + assert(num_row_groups == 1); + + // Get the number of Columns + 
int num_columns = file_metadata->num_columns(); + assert(num_columns == 8); + + // Iterate over all the RowGroups in the file + for (int r = 0; r < num_row_groups; ++r) { + // Get the RowGroup Reader + std::shared_ptr row_group_reader = + file_reader->RowGroup(r); + + int64_t values_read = 0; + int64_t rows_read = 0; + int16_t definition_level; + int16_t repetition_level; + int i; + std::shared_ptr column_reader; + + // Get the Column Reader for the boolean column + column_reader = row_group_reader->Column(0); + parquet::BoolReader* bool_reader = + static_cast(column_reader.get()); + + // Read all the rows in the column + i = 0; + while (bool_reader->HasNext()) { + bool value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + bool expected_value = ((i % 2) == 0) ? true : false; + assert(value == expected_value); + i++; + } + // Get the Column Reader for the Int32 column + column_reader = row_group_reader->Column(1); + parquet::Int32Reader* int32_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int32_reader->HasNext()) { + int32_t value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + assert(value == i); + i++; + } + // Get the Column Reader for the Int64 column + column_reader = row_group_reader->Column(2); + parquet::Int64Reader* int64_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int64_reader->HasNext()) { + int64_t value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = int64_reader->ReadBatch(1, &definition_level, &repetition_level, + &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + int64_t expected_value = i * 1000 * 1000; + expected_value *= 1000 * 1000; + assert(value == expected_value); + if ((i % 2) == 0) { + assert(repetition_level == 1); + } else { + assert(repetition_level == 0); + } + i++; + } + + // Get the Column Reader for the Int96 column + column_reader = row_group_reader->Column(3); + parquet::Int96Reader* int96_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int96_reader->HasNext()) { + parquet::Int96 value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = int96_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + parquet::Int96 expected_value; + expected_value.value[0] = i; + expected_value.value[1] = i + 1; + expected_value.value[2] = i + 2; + for (int j = 0; j < 3; j++) { + assert(value.value[j] == expected_value.value[j]); + } + i++; + } + + // Get the Column Reader for the Float column + column_reader = row_group_reader->Column(4); + parquet::FloatReader* float_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (float_reader->HasNext()) { + float value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + float expected_value = static_cast(i) * 1.1f; + assert(value == expected_value); + i++; + } + // Get the Column Reader for the Double column + column_reader = row_group_reader->Column(5); + parquet::DoubleReader* double_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (double_reader->HasNext()) { + double value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + double expected_value = i * 1.1111111; + assert(value == expected_value); + i++; + } + // Get the Column Reader for the ByteArray column + column_reader = row_group_reader->Column(6); + parquet::ByteArrayReader* ba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (ba_reader->HasNext()) { + parquet::ByteArray value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = + ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // Verify the value written + char expected_value[kFixedLength] = "parquet"; + expected_value[7] = static_cast('0' + i / 100); + expected_value[8] = static_cast('0' + (i / 10) % 10); + expected_value[9] = static_cast('0' + i % 10); + if (i % 2 == 0) { // only alternate values exist + // There are no NULL values in the rows written + assert(values_read == 1); + assert(value.len == kFixedLength); + assert(memcmp(value.ptr, &expected_value[0], kFixedLength) == 0); + assert(definition_level == 1); + } else { + // There are NULL values in the rows written + assert(values_read == 0); + assert(definition_level == 0); + } + i++; + } + // Get the Column Reader for the FixedLengthByteArray column + column_reader = row_group_reader->Column(7); + parquet::FixedLenByteArrayReader* flba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (flba_reader->HasNext()) { + parquet::FixedLenByteArray value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = flba_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + assert(rows_read == 1); + // There are no NULL values in the rows written + assert(values_read == 1); + // Verify the value written + char v = static_cast(i); + char expected_value[kFixedLength] = {v, v, v, v, v, v, v, v, v, v}; + assert(memcmp(value.ptr, &expected_value[0], kFixedLength) == 0); + i++; + } + } + } catch (const std::exception& e) { + exception_msg = e.what(); + } + CheckResult(encryption_configuration, example_id, exception_msg); + } + + // Check that the decryption result is as expected. + void CheckResult(int encryption_configuration_number, int example_id, + std::string exception_msg) { + int decryption_configuration_number = example_id + 1; + // Encryption_configuration number five contains aad_prefix and + // disable_aad_prefix_storage. + // An exception is expected to be thrown if the file is not decrypted with aad_prefix. + if (encryption_configuration_number == 5) { + if (decryption_configuration_number == 1 || decryption_configuration_number == 3) { + std::size_t found = exception_msg.find("AAD"); + ASSERT_FALSE(found == std::string::npos); + return; + } + } + // Decryption configuration number two contains aad_prefix. An exception is expected + // to be thrown if the file was not encrypted with the same aad_prefix. 
+ if (decryption_configuration_number == 2) { + if (encryption_configuration_number != 5 && encryption_configuration_number != 4) { + std::size_t found = exception_msg.find("AAD"); + ASSERT_FALSE(found == std::string::npos); + return; + } + } + if (!exception_msg.empty()) { + ASSERT_EQ(1, 0); + } + } + + std::shared_ptr SetupEncryptionSchema() { + parquet::schema::NodeVector fields; + // Create a primitive node named 'boolean_field' with type:BOOLEAN, + // repetition:REQUIRED + fields.push_back(PrimitiveNode::Make("boolean_field", Repetition::REQUIRED, + Type::BOOLEAN, LogicalType::NONE)); + + // Create a primitive node named 'int32_field' with type:INT32, repetition:REQUIRED, + // logical type:TIME_MILLIS + fields.push_back(PrimitiveNode::Make("int32_field", Repetition::REQUIRED, Type::INT32, + LogicalType::TIME_MILLIS)); + + // Create a primitive node named 'int64_field' with type:INT64, repetition:REPEATED + fields.push_back(PrimitiveNode::Make("int64_field", Repetition::REPEATED, Type::INT64, + LogicalType::NONE)); + + fields.push_back(PrimitiveNode::Make("int96_field", Repetition::REQUIRED, Type::INT96, + LogicalType::NONE)); + + fields.push_back(PrimitiveNode::Make("float_field", Repetition::REQUIRED, Type::FLOAT, + LogicalType::NONE)); + + fields.push_back(PrimitiveNode::Make("double_field", Repetition::REQUIRED, + Type::DOUBLE, LogicalType::NONE)); + + // Create a primitive node named 'ba_field' with type:BYTE_ARRAY, repetition:OPTIONAL + fields.push_back(PrimitiveNode::Make("ba_field", Repetition::OPTIONAL, + Type::BYTE_ARRAY, LogicalType::NONE)); + + // Create a primitive node named 'flba_field' with type:FIXED_LEN_BYTE_ARRAY, + // repetition:REQUIRED, field_length = kFixedLength + fields.push_back(PrimitiveNode::Make("flba_field", Repetition::REQUIRED, + Type::FIXED_LEN_BYTE_ARRAY, LogicalType::NONE, + kFixedLength)); + + // Create a GroupNode named 'schema' using the primitive nodes defined above + // This GroupNode is the root node of the schema tree + 
return std::static_pointer_cast( + GroupNode::Make("schema", Repetition::REQUIRED, fields)); + } +}; + +// Encryption configuration 1: Encrypt all columns and the footer with the same key. +// (uniform encryption) +TEST_F(TestEncryptionConfiguration, UniformEncryption) { + parquet::FileEncryptionProperties::Builder file_encryption_builder_1( + kFooterEncryptionKey_); + + std::shared_ptr buffer = + this->EncryptFile(file_encryption_builder_1.footer_key_metadata("kf")->build()); + + // Iterate over the decryption configurations and use each one to read the encrypted + // parqeut file. + for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); + ++example_id) { + DecryptFile(buffer, example_id, 1 /* encryption_configuration_number */); + } +} + +// Encryption configuration 2: Encrypt two columns and the footer, with different keys. +TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndTheFooter) { + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols2; + std::shared_ptr path_to_double_field_ = + parquet::schema::ColumnPath::FromDotString("double_field"); + std::shared_ptr path_to_float_field_ = + parquet::schema::ColumnPath::FromDotString("float_field"); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_20( + path_to_double_field_); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_21( + path_to_float_field_); + encryption_col_builder_20.key(kColumnEncryptionKey1_)->key_id("kc1"); + encryption_col_builder_21.key(kColumnEncryptionKey2_)->key_id("kc2"); + + encryption_cols2[path_to_double_field_] = encryption_col_builder_20.build(); + encryption_cols2[path_to_float_field_] = encryption_col_builder_21.build(); + + parquet::FileEncryptionProperties::Builder file_encryption_builder_2( + kFooterEncryptionKey_); + + std::shared_ptr buffer = + this->EncryptFile(file_encryption_builder_2.footer_key_metadata("kf") + ->column_properties(encryption_cols2) + 
->build()); + + // Iterate over the decryption configurations and use each one to read the encrypted + // parqeut file. + for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); + ++example_id) { + DecryptFile(buffer, example_id, 2 /* encryption_configuration_number */); + } +} + +TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsWithPlaintextFooter) { + // Encryption configuration 3: Encrypt two columns, with different keys. + // Don’t encrypt footer. + // (plaintext footer mode, readable by legacy readers) + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols3; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_30( + path_to_double_field_); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_31( + path_to_float_field_); + encryption_col_builder_30.key(kColumnEncryptionKey1_)->key_id("kc1"); + encryption_col_builder_31.key(kColumnEncryptionKey2_)->key_id("kc2"); + + encryption_cols3[path_to_double_field_] = encryption_col_builder_30.build(); + encryption_cols3[path_to_float_field_] = encryption_col_builder_31.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_3( + kFooterEncryptionKey_); + + std::shared_ptr buffer = + this->EncryptFile(file_encryption_builder_3.footer_key_metadata("kf") + ->column_properties(encryption_cols3) + ->set_plaintext_footer() + ->build()); + + // Iterate over the decryption configurations and use each one to read the encrypted + // parqeut file. + for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); + ++example_id) { + DecryptFile(buffer, example_id, 3 /* encryption_configuration_number */); + } +} + +// Encryption configuration 4: Encrypt two columns and the footer, with different keys. +// Use aad_prefix. 
+TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndFooterWithAadPrefix) { + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols4; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_40( + path_to_double_field_); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_41( + path_to_float_field_); + encryption_col_builder_40.key(kColumnEncryptionKey1_)->key_id("kc1"); + encryption_col_builder_41.key(kColumnEncryptionKey2_)->key_id("kc2"); + + encryption_cols4[path_to_double_field_] = encryption_col_builder_40.build(); + encryption_cols4[path_to_float_field_] = encryption_col_builder_41.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_4( + kFooterEncryptionKey_); + + std::shared_ptr buffer = + this->EncryptFile(file_encryption_builder_4.footer_key_metadata("kf") + ->column_properties(encryption_cols4) + ->aad_prefix(kFileName_) + ->build()); + // Iterate over the decryption configurations and use each one to read the encrypted + // parqeut file. + for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); + ++example_id) { + DecryptFile(buffer, example_id, 4 /* encryption_configuration_number */); + } +} + +// Encryption configuration 5: Encrypt two columns and the footer, with different keys. +// Use aad_prefix and disable_aad_prefix_storage. 
+TEST_F(TestEncryptionConfiguration, + EncryptTwoColumnsAndFooterWithAadPrefixDisable_aad_prefix_storage) { + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols5; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_50( + path_to_double_field_); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_51( + path_to_float_field_); + encryption_col_builder_50.key(kColumnEncryptionKey1_)->key_id("kc1"); + encryption_col_builder_51.key(kColumnEncryptionKey2_)->key_id("kc2"); + + encryption_cols5[path_to_double_field_] = encryption_col_builder_50.build(); + encryption_cols5[path_to_float_field_] = encryption_col_builder_51.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_5( + kFooterEncryptionKey_); + + std::shared_ptr buffer = + this->EncryptFile(file_encryption_builder_5.column_properties(encryption_cols5) + ->footer_key_metadata("kf") + ->aad_prefix(kFileName_) + ->disable_store_aad_prefix_storage() + ->build()); + // Iterate over the decryption configurations and use each one to read the encrypted + // parqeut file. + for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); + ++example_id) { + DecryptFile(buffer, example_id, 5 /* encryption_configuration_number */); + } +} + +// Encryption configuration 6: Encrypt two columns and the footer, with different keys. +// Use AES_GCM_CTR_V1 algorithm. 
+TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndFooterUseAES_GCM_CTR) { + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols6; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_60( + path_to_double_field_); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_61( + path_to_float_field_); + encryption_col_builder_60.key(kColumnEncryptionKey1_)->key_id("kc1"); + encryption_col_builder_61.key(kColumnEncryptionKey2_)->key_id("kc2"); + + encryption_cols6[path_to_double_field_] = encryption_col_builder_60.build(); + encryption_cols6[path_to_float_field_] = encryption_col_builder_61.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_6( + kFooterEncryptionKey_); + + std::shared_ptr buffer = + this->EncryptFile(file_encryption_builder_6.footer_key_metadata("kf") + ->column_properties(encryption_cols6) + ->algorithm(parquet::ParquetCipher::AES_GCM_CTR_V1) + ->build()); + + // Iterate over the decryption configurations and use each one to read the encrypted + // parqeut file. 
+ for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); + ++example_id) { + DecryptFile(buffer, example_id, 6 /* encryption_configuration_number */); + } +} + +} // namespace test +} // namespace parquet diff --git a/cpp/src/parquet/encryption.cc b/cpp/src/parquet/encryption.cc index 096df59b1353..69da7eeae562 100644 --- a/cpp/src/parquet/encryption.cc +++ b/cpp/src/parquet/encryption.cc @@ -17,11 +17,12 @@ #include "parquet/encryption.h" -#include #include #include #include +#include + #include "arrow/util/logging.h" #include "arrow/util/utf8.h" @@ -29,7 +30,7 @@ namespace parquet { // integer key retriever void IntegerKeyIdRetriever::PutKey(uint32_t key_id, const std::string& key) { - key_map_.insert(std::make_pair(key_id, key)); + key_map_.insert({key_id, key}); } const std::string& IntegerKeyIdRetriever::GetKey(const std::string& key_metadata) { @@ -41,7 +42,7 @@ const std::string& IntegerKeyIdRetriever::GetKey(const std::string& key_metadata // string key retriever void StringKeyIdRetriever::PutKey(const std::string& key_id, const std::string& key) { - key_map_.insert(std::make_pair(key_id, key)); + key_map_.insert({key_id, key}); } const std::string& StringKeyIdRetriever::GetKey(const std::string& key_id) { @@ -80,17 +81,13 @@ ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key_id } FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::column_properties( - const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties) { + const ColumnPathToDecryptionPropertiesMap& column_properties) { if (column_properties.size() == 0) return this; if (column_properties_.size() != 0) throw ParquetException("Column properties already set"); - for (std::pair, - std::shared_ptr> - element : column_properties) { + for (const auto& element : column_properties) { if (element.second->is_utilized()) { throw ParquetException("Column properties utilized in another file"); } @@ 
-101,14 +98,11 @@ FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::column_pro return this; } -void FileDecryptionProperties::wipeout_decryption_keys() { - if (!footer_key_.empty()) - std::memset((char*)(const_cast(footer_key_.c_str())), 0, footer_key_.size()); +void FileDecryptionProperties::WipeOutDecryptionKeys() { + footer_key_.clear(); - for (std::pair, - std::shared_ptr> - element : column_properties_) { - element.second->wipeout_decryption_key(); + for (const auto& element : column_properties_) { + element.second->WipeOutDecryptionKey(); } } @@ -122,17 +116,11 @@ bool FileDecryptionProperties::is_utilized() { std::shared_ptr FileDecryptionProperties::DeepClone( std::string new_aad_prefix) { std::string footer_key_copy = footer_key_; - std::map, - std::shared_ptr, schema::ColumnPath::CmpColumnPath> - column_properties_map_copy; + ColumnPathToDecryptionPropertiesMap column_properties_map_copy; - for (std::pair, - std::shared_ptr> - element : column_properties_) { + for (const auto& element : column_properties_) { column_properties_map_copy.insert( - std::pair, - std::shared_ptr>( - element.second->column_path(), element.second->DeepClone())); + {element.second->column_path(), element.second->DeepClone()}); } if (new_aad_prefix.empty()) new_aad_prefix = aad_prefix_; @@ -193,11 +181,7 @@ std::shared_ptr ColumnDecryptionProperties::Builder: new ColumnDecryptionProperties(column_path_, key_)); } -void ColumnDecryptionProperties::wipeout_decryption_key() { - if (!key_.empty()) { - std::memset((char*)(const_cast(key_.c_str())), 0, key_.size()); - } -} +void ColumnDecryptionProperties::WipeOutDecryptionKey() { key_.clear(); } std::shared_ptr ColumnDecryptionProperties::DeepClone() { std::string key_copy = key_; @@ -215,17 +199,13 @@ FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key } FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::column_properties( - const std::map, - std::shared_ptr, - 
schema::ColumnPath::CmpColumnPath>& column_properties) { + const ColumnPathToEncryptionPropertiesMap& column_properties) { if (column_properties.size() == 0) return this; if (column_properties_.size() != 0) throw ParquetException("Column properties already set"); - for (std::pair, - std::shared_ptr> - element : column_properties) { + for (const auto& element : column_properties) { if (element.second->is_utilized()) { throw ParquetException("Column properties utilized in another file"); } @@ -235,29 +215,21 @@ FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::column_pro return this; } -void FileEncryptionProperties::wipeout_encryption_keys() { - std::memset((char*)(const_cast(footer_key_.c_str())), 0, footer_key_.size()); - for (std::pair, - std::shared_ptr> - element : column_properties_) { - element.second->wipeout_encryption_key(); +void FileEncryptionProperties::WipeOutEncryptionKeys() { + footer_key_.clear(); + for (const auto& element : column_properties_) { + element.second->WipeOutEncryptionKey(); } } std::shared_ptr FileEncryptionProperties::DeepClone( std::string new_aad_prefix) { std::string footer_key_copy = footer_key_; - std::map, - std::shared_ptr, schema::ColumnPath::CmpColumnPath> - column_properties_map_copy; + ColumnPathToEncryptionPropertiesMap column_properties_map_copy; - for (std::pair, - std::shared_ptr> - element : column_properties_) { + for (const auto& element : column_properties_) { column_properties_map_copy.insert( - std::pair, - std::shared_ptr>( - element.second->column_path(), element.second->DeepClone())); + {element.second->column_path(), element.second->DeepClone()}); } if (new_aad_prefix.empty()) new_aad_prefix = aad_prefix_; @@ -342,9 +314,7 @@ FileDecryptionProperties::FileDecryptionProperties( const std::shared_ptr& key_retriever, bool check_plaintext_footer_integrity, const std::string& aad_prefix, std::shared_ptr aad_prefix_verifier, - const std::map, - std::shared_ptr, - 
schema::ColumnPath::CmpColumnPath>& column_properties, + const ColumnPathToDecryptionPropertiesMap& column_properties, bool plaintext_files_allowed) { DCHECK(!footer_key.empty() || NULLPTR != key_retriever || 0 != column_properties.size()); @@ -393,16 +363,14 @@ std::shared_ptr FileEncryptionProperties::column_pro return column_properties_[column_path]; } - return NULLPTR; + return nullptr; } FileEncryptionProperties::FileEncryptionProperties( ParquetCipher::type cipher, const std::string& footer_key, const std::string& footer_key_metadata, bool encrypted_footer, const std::string& aad_prefix, bool store_aad_prefix_in_file, - const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties) + const ColumnPathToEncryptionPropertiesMap& column_properties) : footer_key_(footer_key), footer_key_metadata_(footer_key_metadata), encrypted_footer_(encrypted_footer), diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 2ffe36ec73fc..f12db27dbec7 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -24,7 +24,6 @@ #include #include -#include "parquet/encryption.h" #include "parquet/exception.h" #include "parquet/schema.h" #include "parquet/types.h" @@ -39,6 +38,18 @@ static constexpr bool kDefaultCheckSignature = true; static constexpr bool kDefaultAllowPlaintextFiles = false; static constexpr int32_t kAadFileUniqueLength = 8; +class ColumnDecryptionProperties; +using ColumnPathToDecryptionPropertiesMap = + std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>; + +class ColumnEncryptionProperties; +using ColumnPathToEncryptionPropertiesMap = + std::map, + std::shared_ptr, + schema::ColumnPath::CmpColumnPath>; + class PARQUET_EXPORT DecryptionKeyRetriever { public: virtual const std::string& GetKey(const std::string& key_metadata) = 0; @@ -77,15 +88,16 @@ class PARQUET_EXPORT KeyAccessDeniedException : public ParquetException { : ParquetException(columnPath.c_str()) {} }; -class 
PARQUET_EXPORT UnsupportedOperationException : public ParquetException { - public: - explicit UnsupportedOperationException(const std::string& columnPath) - : ParquetException(columnPath.c_str()) {} -}; +inline uint8_t* str2bytes(const std::string& str) { + if (str.empty()) return nullptr; + + char* cbytes = const_cast(str.c_str()); + return reinterpret_cast(cbytes); +} class PARQUET_EXPORT ColumnEncryptionProperties { public: - class PARQUET_EXPORT Builder { + class Builder { public: /// Convenience builder for regular (not nested) columns. explicit Builder(const std::string& name) { @@ -109,6 +121,7 @@ class PARQUET_EXPORT ColumnEncryptionProperties { /// use either key_metadata() or key_id(), not both Builder* key_metadata(const std::string& key_metadata); + /// A convenience function to set key metadata using a string id. /// Set a key retrieval metadata (converted from String). /// use either key_metadata() or key_id(), not both /// key_id will be converted to metadata (UTF-8 array). @@ -134,11 +147,10 @@ class PARQUET_EXPORT ColumnEncryptionProperties { bool is_encrypted_with_footer_key() const { return encrypted_with_footer_key_; } const std::string& key() const { return key_; } const std::string& key_metadata() const { return key_metadata_; } - void wipeout_encryption_key() { - if (!key_.empty()) { - std::memset((char*)(const_cast(key_.c_str())), 0, key_.size()); - } - } + + /// Upon completion of file writing, the encryption key + /// will be wiped out. + void WipeOutEncryptionKey() { key_.clear(); } bool is_utilized() { if (key_.empty()) @@ -146,6 +158,10 @@ class PARQUET_EXPORT ColumnEncryptionProperties { return utilized_; } + /// ColumnEncryptionProperties object can be used for writing one file only. + /// Mark ColumnEncryptionProperties as utilized once it is used in + /// FileEncryptionProperties as the encryption key will be wiped out upon + /// completion of file writing. 
void set_utilized() { utilized_ = true; } std::shared_ptr DeepClone() { @@ -172,7 +188,7 @@ class PARQUET_EXPORT ColumnEncryptionProperties { class PARQUET_EXPORT ColumnDecryptionProperties { public: - class PARQUET_EXPORT Builder { + class Builder { public: /// convenience builder for regular (not nested) columns. explicit Builder(const std::string& name) @@ -202,9 +218,15 @@ class PARQUET_EXPORT ColumnDecryptionProperties { const std::string& key() const { return key_; } bool is_utilized() { return utilized_; } + /// ColumnDecryptionProperties object can be used for reading one file only. + /// Mark ColumnDecryptionProperties as utilized once it is used in + /// FileDecryptionProperties as the encryption key will be wiped out upon + /// completion of file reading. void set_utilized() { utilized_ = true; } - void wipeout_decryption_key(); + /// Upon completion of file reading, the encryption key + /// will be wiped out. + void WipeOutDecryptionKey(); std::shared_ptr DeepClone(); @@ -227,13 +249,13 @@ class PARQUET_EXPORT AADPrefixVerifier { /// Throws exception if an AAD prefix is wrong. /// In a data set, AAD Prefixes should be collected, /// and then checked for missing files. - virtual void check(const std::string& aad_prefix) = 0; + virtual void Verify(const std::string& aad_prefix) = 0; virtual ~AADPrefixVerifier() {} }; class PARQUET_EXPORT FileDecryptionProperties { public: - class PARQUET_EXPORT Builder { + class Builder { public: Builder() { check_plaintext_footer_integrity_ = kDefaultCheckSignature; @@ -261,9 +283,7 @@ class PARQUET_EXPORT FileDecryptionProperties { /// If an explicit key is available for a footer or a column, /// its key metadata will be ignored. Builder* column_properties( - const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties); + const ColumnPathToDecryptionPropertiesMap& column_properties); /// Set a key retriever callback. 
Its also possible to /// set explicit footer or column keys on this file property object. @@ -314,11 +334,7 @@ class PARQUET_EXPORT FileDecryptionProperties { std::string footer_key_; std::string aad_prefix_; std::shared_ptr aad_prefix_verifier_; - - std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath> - column_properties_; + ColumnPathToDecryptionPropertiesMap column_properties_; std::shared_ptr key_retriever_; bool check_plaintext_footer_integrity_; @@ -340,10 +356,15 @@ class PARQUET_EXPORT FileDecryptionProperties { return aad_prefix_verifier_; } - void wipeout_decryption_keys(); + /// Upon completion of file reading, the encryption keys in the properties + /// will be wiped out (array values set to 0). + void WipeOutDecryptionKeys(); bool is_utilized(); + /// FileDecryptionProperties object can be used for reading one file only. + /// Mark FileDecryptionProperties as utilized once it is used to read a file as the + /// encryption keys will be wiped out upon completion of file reading. void set_utilized() { utilized_ = true; } /// FileDecryptionProperties object can be used for reading one file only. 
@@ -361,30 +382,25 @@ class PARQUET_EXPORT FileDecryptionProperties { std::shared_ptr aad_prefix_verifier_; const std::string empty_string_ = ""; - - std::map, - std::shared_ptr, schema::ColumnPath::CmpColumnPath> - column_properties_; + ColumnPathToDecryptionPropertiesMap column_properties_; std::shared_ptr key_retriever_; bool check_plaintext_footer_integrity_; bool plaintext_files_allowed_; bool utilized_; - FileDecryptionProperties( - const std::string& footer_key, - const std::shared_ptr& key_retriever, - bool check_plaintext_footer_integrity, const std::string& aad_prefix, - std::shared_ptr aad_prefix_verifier, - const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties, - bool plaintext_files_allowed); + FileDecryptionProperties(const std::string& footer_key, + const std::shared_ptr& key_retriever, + bool check_plaintext_footer_integrity, + const std::string& aad_prefix, + std::shared_ptr aad_prefix_verifier, + const ColumnPathToDecryptionPropertiesMap& column_properties, + bool plaintext_files_allowed); }; class PARQUET_EXPORT FileEncryptionProperties { public: - class PARQUET_EXPORT Builder { + class Builder { public: explicit Builder(const std::string& footer_key) : parquet_cipher_(kDefaultEncryptionAlgorithm), @@ -426,9 +442,7 @@ class PARQUET_EXPORT FileEncryptionProperties { /// If not called, all columns will be encrypted with the footer key. /// If called, the file columns not in the list will be left unencrypted. 
Builder* column_properties( - const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties); + const ColumnPathToEncryptionPropertiesMap& column_properties); std::shared_ptr build() { return std::shared_ptr(new FileEncryptionProperties( @@ -444,10 +458,7 @@ class PARQUET_EXPORT FileEncryptionProperties { std::string aad_prefix_; bool store_aad_prefix_in_file_; - std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath> - column_properties_; + ColumnPathToEncryptionPropertiesMap column_properties_; }; bool encrypted_footer() const { return encrypted_footer_; } @@ -464,9 +475,14 @@ class PARQUET_EXPORT FileEncryptionProperties { bool is_utilized() { return utilized_; } + /// FileEncryptionProperties object can be used for writing one file only. + /// Mark FileEncryptionProperties as utilized once it is used to write a file as the + /// encryption keys will be wiped out upon completion of file writing. void set_utilized() { utilized_ = true; } - void wipeout_encryption_keys(); + /// Upon completion of file writing, the encryption keys + /// will be wiped out (array values set to 0). + void WipeOutEncryptionKeys(); /// FileEncryptionProperties object can be used for writing one file only. /// (at the end, keys are wiped out in the memory). 
@@ -484,18 +500,12 @@ class PARQUET_EXPORT FileEncryptionProperties { std::string aad_prefix_; bool utilized_; bool store_aad_prefix_in_file_; + ColumnPathToEncryptionPropertiesMap column_properties_; - std::map, - std::shared_ptr, schema::ColumnPath::CmpColumnPath> - column_properties_; - - FileEncryptionProperties( - ParquetCipher::type cipher, const std::string& footer_key, - const std::string& footer_key_metadata, bool encrypted_footer, - const std::string& aad_prefix, bool store_aad_prefix_in_file, - const std::map, - std::shared_ptr, - schema::ColumnPath::CmpColumnPath>& column_properties); + FileEncryptionProperties(ParquetCipher::type cipher, const std::string& footer_key, + const std::string& footer_key_metadata, bool encrypted_footer, + const std::string& aad_prefix, bool store_aad_prefix_in_file, + const ColumnPathToEncryptionPropertiesMap& column_properties); }; } // namespace parquet diff --git a/cpp/src/parquet/encryption_internal.cc b/cpp/src/parquet/encryption_internal.cc index bf3239d42c41..b73f910b2306 100644 --- a/cpp/src/parquet/encryption_internal.cc +++ b/cpp/src/parquet/encryption_internal.cc @@ -391,7 +391,7 @@ AesDecryptor::AesDecryptorImpl::AesDecryptorImpl(ParquetCipher::type alg_id, int AesEncryptor* AesEncryptor::Make( ParquetCipher::type alg_id, int key_len, bool metadata, - std::shared_ptr> all_encryptors) { + std::vector *all_encryptors) { if (ParquetCipher::AES_GCM_V1 != alg_id && ParquetCipher::AES_GCM_CTR_V1 != alg_id) { std::stringstream ss; ss << "Crypto algorithm " << alg_id << " is not supported"; @@ -399,9 +399,8 @@ AesEncryptor* AesEncryptor::Make( } AesEncryptor* encryptor = new AesEncryptor(alg_id, key_len, metadata); - if (all_encryptors != NULLPTR) { - all_encryptors->push_back(encryptor); - } + if (all_encryptors != NULLPTR) + all_encryptors->push_back(encryptor); return encryptor; } @@ -411,7 +410,7 @@ AesDecryptor::AesDecryptor(ParquetCipher::type alg_id, int key_len, bool metadat AesDecryptor* AesDecryptor::Make( 
ParquetCipher::type alg_id, int key_len, bool metadata, - std::shared_ptr> all_decryptors) { + std::vector *all_decryptors) { if (ParquetCipher::AES_GCM_V1 != alg_id && ParquetCipher::AES_GCM_CTR_V1 != alg_id) { std::stringstream ss; ss << "Crypto algorithm " << alg_id << " is not supported"; diff --git a/cpp/src/parquet/encryption_internal.h b/cpp/src/parquet/encryption_internal.h index af668dc41366..410d1ed5472b 100644 --- a/cpp/src/parquet/encryption_internal.h +++ b/cpp/src/parquet/encryption_internal.h @@ -47,7 +47,7 @@ constexpr int8_t kOffsetIndex = 7; class AesEncryptor { public: static AesEncryptor* Make(ParquetCipher::type alg_id, int key_len, bool metadata, - std::shared_ptr> all_encryptors); + std::vector *all_encryptors); ~AesEncryptor(); @@ -78,7 +78,7 @@ class AesEncryptor { class AesDecryptor { public: static AesDecryptor* Make(ParquetCipher::type alg_id, int key_len, bool metadata, - std::shared_ptr> all_decryptors); + std::vector *all_decryptors); ~AesDecryptor(); void WipeOut(); diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 6d27e2dc0035..fc3e7dbd1e36 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -125,17 +125,12 @@ class SerializedRowGroup : public RowGroupReader::Contents { properties_.GetStream(source_, col_start, col_length); std::unique_ptr crypto_metadata = col->crypto_metadata(); - bool encrypted = true; - // Column is encrypted only if crypto_metadata exists. 
if (!crypto_metadata) { - encrypted = false; - } - - if (!encrypted) { + PageReaderContext ctx = {col->has_dictionary_page(), row_group_ordinal_, + static_cast(i), NULLPTR, NULLPTR}; return PageReader::Open(stream, col->num_values(), col->compression(), - properties_.memory_pool(), col->has_dictionary_page(), - row_group_ordinal_, (int16_t)i /* column_ordinal */); + properties_.memory_pool(), &ctx); } // The column is encrypted @@ -144,11 +139,10 @@ class SerializedRowGroup : public RowGroupReader::Contents { if (crypto_metadata->encrypted_with_footer_key()) { auto meta_decryptor = file_decryptor_->GetFooterDecryptorForColumnMeta(); auto data_decryptor = file_decryptor_->GetFooterDecryptorForColumnData(); - + PageReaderContext ctx = {col->has_dictionary_page(), row_group_ordinal_, + static_cast(i), meta_decryptor, data_decryptor}; return PageReader::Open(stream, col->num_values(), col->compression(), - properties_.memory_pool(), col->has_dictionary_page(), - row_group_ordinal_, (int16_t)i, meta_decryptor, - data_decryptor); + properties_.memory_pool(), &ctx); } // The column is encrypted with its own key @@ -161,10 +155,10 @@ class SerializedRowGroup : public RowGroupReader::Contents { auto data_decryptor = file_decryptor_->GetColumnDataDecryptor(column_path, column_key_metadata); + PageReaderContext ctx = {col->has_dictionary_page(), row_group_ordinal_, + static_cast(i), meta_decryptor, data_decryptor}; return PageReader::Open(stream, col->num_values(), col->compression(), - properties_.memory_pool(), col->has_dictionary_page(), - row_group_ordinal_, (int16_t)i, meta_decryptor, - data_decryptor); + properties_.memory_pool(), &ctx); } private: @@ -196,7 +190,7 @@ class SerializedFile : public ParquetFileReader::Contents { } void Close() override { - if (file_decryptor_) file_decryptor_->wipeout_decryption_keys(); + if (file_decryptor_) file_decryptor_->WipeOutDecryptionKeys(); } std::shared_ptr GetRowGroup(int i) override { @@ -236,7 +230,7 @@ class SerializedFile 
: public ParquetFileReader::Contents { throw ParquetException("Invalid parquet file. Corrupt footer."); } - // no encryption or encryption with plaintext footer + // No encryption or encryption with plaintext footer mode. if (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) == 0) { uint32_t metadata_len = arrow::util::SafeLoadAs( reinterpret_cast(footer_buffer->data()) + footer_read_size - @@ -265,178 +259,171 @@ class SerializedFile : public ParquetFileReader::Contents { file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &read_metadata_len); auto file_decryption_properties = properties_.file_decryption_properties(); - if (!file_metadata_->is_encryption_algorithm_set()) { // Plaintext file + if (!file_metadata_->is_encryption_algorithm_set()) { // Non encrypted file. if (file_decryption_properties != NULLPTR) { if (!file_decryption_properties->plaintext_files_allowed()) { throw ParquetException("Applying decryption properties on plaintext file"); } } } else { - if (file_decryption_properties == NULLPTR) { - throw ParquetException("No decryption properties are provided"); - } + // Encrypted file with plaintext footer mode. + ParseMetaDataOfEncryptedFileWithPlaintextFooter( + file_decryption_properties, metadata_buffer, metadata_len, read_metadata_len); + } + } else { + // Encrypted file with Encrypted footer. + ParseMetaDataOfEncryptedFileWithEncryptedFooter(footer_buffer, footer_read_size, + file_size); + } + } - // Handle AAD prefix - EncryptionAlgorithm algo = file_metadata_->encryption_algorithm(); - std::string aad_prefix_in_properites = file_decryption_properties->aad_prefix(); - std::string aad_prefix = aad_prefix_in_properites; - bool file_has_aad_prefix = algo.aad.aad_prefix.empty() ? 
false : true; - std::string aad_prefix_in_file = algo.aad.aad_prefix; - - if (algo.aad.supply_aad_prefix && aad_prefix_in_properites.empty()) { - throw ParquetException( - "AAD prefix used for file encryption, " - "but not stored in file and not supplied " - "in decryption properties"); - } + private: + std::shared_ptr source_; + std::shared_ptr file_metadata_; + ReaderProperties properties_; + std::unique_ptr file_decryptor_; - if (file_has_aad_prefix) { - if (!aad_prefix_in_properites.empty()) { - if (aad_prefix_in_properites.compare(aad_prefix_in_file) != 0) { - throw ParquetException( - "AAD Prefix in file and in properties " - "is not the same"); - } - } - aad_prefix = aad_prefix_in_file; - std::shared_ptr aad_prefix_verifier = - file_decryption_properties->aad_prefix_verifier(); - if (aad_prefix_verifier != NULLPTR) aad_prefix_verifier->check(aad_prefix); - } else { - if (!algo.aad.supply_aad_prefix && !aad_prefix_in_properites.empty()) { - throw ParquetException( - "AAD Prefix set in decryption properties, but was not used " - "for file encryption"); - } - std::shared_ptr aad_prefix_verifier = - file_decryption_properties->aad_prefix_verifier(); - if (aad_prefix_verifier != NULLPTR) { - throw ParquetException( - "AAD Prefix Verifier is set, but AAD Prefix not found in file"); - } - } - std::string file_aad = aad_prefix + algo.aad.aad_file_unique; - file_decryptor_.reset(new InternalFileDecryptor( - file_decryption_properties, file_aad, algo.algorithm, - file_metadata_->footer_signing_key_metadata())); - - if (file_decryption_properties->check_plaintext_footer_integrity()) { - if (metadata_len - read_metadata_len != 28) { - throw ParquetException( - "Invalid parquet file. 
Cannot verify plaintext mode footer."); - } + std::string HandleAadPrefix(FileDecryptionProperties* file_decryption_properties, + EncryptionAlgorithm& algo); - auto encryptor = file_decryptor_->GetFooterSigningEncryptor(); - if (!file_metadata_->verify_signature( - encryptor, metadata_buffer->data() + read_metadata_len)) { - throw ParquetException( - "Invalid parquet file. Could not verify plaintext " - "footer metadata"); - } - } - } - } else { - // encryption with encrypted footer - // both metadata & crypto metadata length - uint32_t footer_len = arrow::util::SafeLoadAs( - reinterpret_cast(footer_buffer->data()) + footer_read_size - - kFooterSize); - int64_t crypto_metadata_start = file_size - kFooterSize - footer_len; - if (kFooterSize + footer_len > file_size) { - throw ParquetException( - "Invalid parquet file. File is less than " - "file metadata size."); - } - std::shared_ptr crypto_metadata_buffer; + void ParseMetaDataOfEncryptedFileWithPlaintextFooter( + FileDecryptionProperties* file_decryption_properties, + const std::shared_ptr& metadata_buffer, uint32_t metadata_len, + uint32_t read_metadata_len); - // Check if the footer_buffer contains the entire metadata - if (footer_read_size >= (footer_len + kFooterSize)) { - crypto_metadata_buffer = SliceBuffer( - footer_buffer, footer_read_size - footer_len - kFooterSize, footer_len); - } else { - PARQUET_THROW_NOT_OK( - source_->ReadAt(crypto_metadata_start, footer_len, &crypto_metadata_buffer)); - if (crypto_metadata_buffer->size() != footer_len) { - throw ParquetException("Invalid parquet file. 
Could not read metadata bytes."); - } - } - auto file_decryption_properties = properties_.file_decryption_properties(); - if (file_decryption_properties == nullptr) { + void ParseMetaDataOfEncryptedFileWithEncryptedFooter( + const std::shared_ptr& footer_buffer, int64_t footer_read_size, + int64_t file_size); +}; + +void SerializedFile::ParseMetaDataOfEncryptedFileWithEncryptedFooter( + const std::shared_ptr& footer_buffer, int64_t footer_read_size, + int64_t file_size) { + // encryption with encrypted footer + // both metadata & crypto metadata length + uint32_t footer_len = arrow::util::SafeLoadAs( + reinterpret_cast(footer_buffer->data()) + footer_read_size - + kFooterSize); + int64_t crypto_metadata_start = file_size - kFooterSize - footer_len; + if (kFooterSize + footer_len > file_size) { + throw ParquetException( + "Invalid parquet file. File is less than " + "file metadata size."); + } + std::shared_ptr crypto_metadata_buffer; + // Check if the footer_buffer contains the entire metadata + if (footer_read_size >= (footer_len + kFooterSize)) { + crypto_metadata_buffer = SliceBuffer( + footer_buffer, footer_read_size - footer_len - kFooterSize, footer_len); + } else { + PARQUET_THROW_NOT_OK( + source_->ReadAt(crypto_metadata_start, footer_len, &crypto_metadata_buffer)); + if (crypto_metadata_buffer->size() != footer_len) { + throw ParquetException("Invalid parquet file. Could not read metadata bytes."); + } + } + auto file_decryption_properties = properties_.file_decryption_properties(); + if (file_decryption_properties == nullptr) { + throw ParquetException( + "No decryption properties are provided. 
Could not read " + "encrypted footer metadata"); + } + uint32_t crypto_metadata_len = footer_len; + std::shared_ptr file_crypto_metadata = + FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); + // Handle AAD prefix + EncryptionAlgorithm algo = file_crypto_metadata->encryption_algorithm(); + std::string file_aad = HandleAadPrefix(file_decryption_properties, algo); + file_decryptor_.reset(new InternalFileDecryptor(file_decryption_properties, file_aad, + algo.algorithm, + file_crypto_metadata->key_metadata())); + int64_t metadata_offset = file_size - kFooterSize - footer_len + crypto_metadata_len; + uint32_t metadata_len = footer_len - crypto_metadata_len; + std::shared_ptr metadata_buffer; + PARQUET_THROW_NOT_OK(source_->ReadAt(metadata_offset, metadata_len, &metadata_buffer)); + if (metadata_buffer->size() != metadata_len) { + throw ParquetException( + "Invalid encrypted parquet file. " + "Could not read footer metadata bytes."); + } + + auto footer_decryptor = file_decryptor_->GetFooterDecryptor(); + file_metadata_ = + FileMetaData::Make(metadata_buffer->data(), &metadata_len, footer_decryptor); +} + +void SerializedFile::ParseMetaDataOfEncryptedFileWithPlaintextFooter( + FileDecryptionProperties* file_decryption_properties, + const std::shared_ptr& metadata_buffer, uint32_t metadata_len, + uint32_t read_metadata_len) { + // Providing decryption properties in plaintext footer mode is not mandatory, for + // example when reading with a legacy reader.
+ if (file_decryption_properties != NULLPTR) { + EncryptionAlgorithm algo = file_metadata_->encryption_algorithm(); + // Handle AAD prefix + std::string file_aad = HandleAadPrefix(file_decryption_properties, algo); + file_decryptor_.reset( + new InternalFileDecryptor(file_decryption_properties, file_aad, algo.algorithm, + file_metadata_->footer_signing_key_metadata())); + + if (file_decryption_properties->check_plaintext_footer_integrity()) { + if (metadata_len - read_metadata_len != 28) { throw ParquetException( - "No decryption properties are provided. Could not read " - "encrypted footer metadata"); + "Invalid parquet file. Cannot verify plaintext mode footer."); } - uint32_t crypto_metadata_len = footer_len; - std::shared_ptr file_crypto_metadata = - FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len); - EncryptionAlgorithm algo = file_crypto_metadata->encryption_algorithm(); - - // Handle AAD prefix - std::string aad_prefix_in_properites = file_decryption_properties->aad_prefix(); - std::string aad_prefix = aad_prefix_in_properites; - bool file_has_aad_prefix = algo.aad.aad_prefix.empty() ? false : true; - std::string aad_prefix_in_file = algo.aad.aad_prefix; - - if (algo.aad.supply_aad_prefix && aad_prefix_in_properites.empty()) { + + auto encryptor = file_decryptor_->GetFooterSigningEncryptor(); + if (!file_metadata_->verify_signature( + encryptor, metadata_buffer->data() + read_metadata_len)) { throw ParquetException( - "AAD prefix used for file encryption, " - "but not stored in file and not supplied " - "in decryption properties"); + "Invalid parquet file. 
Could not verify plaintext " + "footer metadata"); } + } + } +} - if (file_has_aad_prefix) { - if (!aad_prefix_in_properites.empty()) { - if (aad_prefix_in_properites.compare(aad_prefix_in_file) != 0) { - throw ParquetException( - "AAD Prefix in file and in properties " - "is not the same"); - } - } - aad_prefix = aad_prefix_in_file; - std::shared_ptr aad_prefix_verifier = - file_decryption_properties->aad_prefix_verifier(); - if (aad_prefix_verifier != NULLPTR) aad_prefix_verifier->check(aad_prefix); - } else { - if (!algo.aad.supply_aad_prefix && !aad_prefix_in_properites.empty()) { - throw ParquetException( - "AAD Prefix set in decryption properties, but was not used " - "for file encryption"); - } - std::shared_ptr aad_prefix_verifier = - file_decryption_properties->aad_prefix_verifier(); - if (aad_prefix_verifier != NULLPTR) { - throw ParquetException( - "AAD Prefix Verifier is set, but AAD Prefix not found in file"); - } - } - std::string file_aad = aad_prefix + algo.aad.aad_file_unique; - file_decryptor_.reset( - new InternalFileDecryptor(file_decryption_properties, file_aad, algo.algorithm, - file_crypto_metadata->key_metadata())); - int64_t metadata_offset = - file_size - kFooterSize - footer_len + crypto_metadata_len; - uint32_t metadata_len = footer_len - crypto_metadata_len; - std::shared_ptr metadata_buffer; - PARQUET_THROW_NOT_OK( - source_->ReadAt(metadata_offset, metadata_len, &metadata_buffer)); - if (metadata_buffer->size() != metadata_len) { +std::string SerializedFile::HandleAadPrefix( + FileDecryptionProperties* file_decryption_properties, EncryptionAlgorithm& algo) { + std::string aad_prefix_in_properties = file_decryption_properties->aad_prefix(); + std::string aad_prefix = aad_prefix_in_properties; + bool file_has_aad_prefix = algo.aad.aad_prefix.empty() ? 
false : true; + std::string aad_prefix_in_file = algo.aad.aad_prefix; + + if (algo.aad.supply_aad_prefix && aad_prefix_in_properties.empty()) { + throw ParquetException( + "AAD prefix used for file encryption, " + "but not stored in file and not supplied " + "in decryption properties"); + } + + if (file_has_aad_prefix) { + if (!aad_prefix_in_properties.empty()) { + if (aad_prefix_in_properties.compare(aad_prefix_in_file) != 0) { throw ParquetException( - "Invalid encrypted parquet file. " - "Could not read footer metadata bytes."); + "AAD Prefix in file and in properties " + "is not the same"); } - - auto footer_decryptor = file_decryptor_->GetFooterDecryptor(); - file_metadata_ = - FileMetaData::Make(metadata_buffer->data(), &metadata_len, footer_decryptor); + } + aad_prefix = aad_prefix_in_file; + std::shared_ptr aad_prefix_verifier = + file_decryption_properties->aad_prefix_verifier(); + if (aad_prefix_verifier != NULLPTR) aad_prefix_verifier->Verify(aad_prefix); + } else { + if (!algo.aad.supply_aad_prefix && !aad_prefix_in_properties.empty()) { + throw ParquetException( + "AAD Prefix set in decryption properties, but was not used " + "for file encryption"); + } + std::shared_ptr aad_prefix_verifier = + file_decryption_properties->aad_prefix_verifier(); + if (aad_prefix_verifier != NULLPTR) { + throw ParquetException( + "AAD Prefix Verifier is set, but AAD Prefix not found in file"); } } - - private: - std::shared_ptr source_; - std::shared_ptr file_metadata_; - ReaderProperties properties_; - std::unique_ptr file_decryptor_; -}; + return aad_prefix + algo.aad.aad_file_unique; +} // ---------------------------------------------------------------------- // ParquetFileReader public API @@ -520,9 +507,9 @@ std::shared_ptr ParquetFileReader::metadata() const { } std::shared_ptr ParquetFileReader::RowGroup(int i) { - DCHECK(i < metadata()->num_row_groups()) - << "The file only has " << metadata()->num_row_groups() - << "row groups, requested reader for: " << i; + 
DCHECK(i < metadata()->num_row_groups()) << "The file only has " + << metadata()->num_row_groups() + << "row groups, requested reader for: " << i; return contents_->GetRowGroup(i); } diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 4eb491f9a661..bfcf636301b5 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -136,7 +136,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents { std::unique_ptr pager = PageWriter::Open( sink_, properties_->compression(column_descr->path()), col_meta, - row_group_ordinal_, (int16_t)(current_column_index_ - 1), + row_group_ordinal_, static_cast(current_column_index_ - 1), properties_->memory_pool(), false, meta_encryptor, data_encryptor); column_writers_[0] = ColumnWriter::Make(col_meta, std::move(pager), properties_); return column_writers_[0].get(); @@ -240,11 +240,11 @@ class RowGroupSerializer : public RowGroupWriter::Contents { auto data_encryptor = file_encryptor_ ? file_encryptor_->GetColumnDataEncryptor(column_descr->path()) : NULLPTR; - std::unique_ptr pager = - PageWriter::Open(sink_, properties_->compression(column_descr->path()), - col_meta, (int16_t)row_group_ordinal_, - (int16_t)current_column_index_, properties_->memory_pool(), - buffered_row_group_, meta_encryptor, data_encryptor); + std::unique_ptr pager = PageWriter::Open( + sink_, properties_->compression(column_descr->path()), col_meta, + static_cast(row_group_ordinal_), + static_cast(current_column_index_), properties_->memory_pool(), + buffered_row_group_, meta_encryptor, data_encryptor); column_writers_.push_back( ColumnWriter::Make(col_meta, std::move(pager), properties_)); } @@ -286,38 +286,11 @@ class FileSerializer : public ParquetFileWriter::Contents { // Write magic bytes and metadata auto file_encryption_properties = properties_->file_encryption_properties(); - if (file_encryption_properties == nullptr) { // plaintext regular file + if (file_encryption_properties == nullptr) { // 
Non encrypted file. file_metadata_ = metadata_->Finish(); WriteFileMetaData(*file_metadata_, sink_.get()); } else { // Encrypted file - // Encrypted file with encrypted footer - if (file_encryption_properties->encrypted_footer()) { - // encrypted footer - file_metadata_ = metadata_->Finish(); - - int64_t position = -1; - PARQUET_THROW_NOT_OK(sink_->Tell(&position)); - uint64_t metadata_start = static_cast(position); - auto crypto_metadata = metadata_->GetCryptoMetaData(); - WriteFileCryptoMetaData(*crypto_metadata, sink_.get()); - - auto footer_encryptor = file_encryptor_->GetFooterEncryptor(); - WriteFileMetaData(*file_metadata_, sink_.get(), footer_encryptor, true); - PARQUET_THROW_NOT_OK(sink_->Tell(&position)); - uint32_t footer_and_crypto_len = - static_cast(position - metadata_start); - PARQUET_THROW_NOT_OK( - sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4)); - PARQUET_THROW_NOT_OK(sink_->Write(kParquetEMagic, 4)); - } else { // Encrypted file with plaintext footer - file_metadata_ = metadata_->Finish(); - auto footer_signing_encryptor = file_encryptor_->GetFooterSigningEncryptor(); - WriteFileMetaData(*file_metadata_, sink_.get(), footer_signing_encryptor, - false); - } - if (file_encryptor_) { - file_encryptor_->wipeout_encryption_keys(); - } + CloseEncryptedFile(file_encryption_properties); } } } @@ -339,7 +312,7 @@ class FileSerializer : public ParquetFileWriter::Contents { num_row_groups_++; auto rg_metadata = metadata_->AppendRowGroup(); std::unique_ptr contents(new RowGroupSerializer( - sink_, rg_metadata, (int16_t)(num_row_groups_ - 1), properties_.get(), + sink_, rg_metadata, static_cast(num_row_groups_ - 1), properties_.get(), buffered_row_group, file_encryptor_.get())); row_group_writer_.reset(new RowGroupWriter(std::move(contents))); @@ -372,6 +345,36 @@ class FileSerializer : public ParquetFileWriter::Contents { StartFile(); } + void CloseEncryptedFile(FileEncryptionProperties* file_encryption_properties) { + // Encrypted file with 
encrypted footer + if (file_encryption_properties->encrypted_footer()) { + // encrypted footer + file_metadata_ = metadata_->Finish(); + + int64_t position = -1; + PARQUET_THROW_NOT_OK(sink_->Tell(&position)); + uint64_t metadata_start = static_cast(position); + auto crypto_metadata = metadata_->GetCryptoMetaData(); + WriteFileCryptoMetaData(*crypto_metadata, sink_.get()); + + auto footer_encryptor = file_encryptor_->GetFooterEncryptor(); + WriteEncryptedFileMetadata(*file_metadata_, sink_.get(), footer_encryptor, true); + PARQUET_THROW_NOT_OK(sink_->Tell(&position)); + uint32_t footer_and_crypto_len = static_cast(position - metadata_start); + PARQUET_THROW_NOT_OK( + sink_->Write(reinterpret_cast(&footer_and_crypto_len), 4)); + PARQUET_THROW_NOT_OK(sink_->Write(kParquetEMagic, 4)); + } else { // Encrypted file with plaintext footer + file_metadata_ = metadata_->Finish(); + auto footer_signing_encryptor = file_encryptor_->GetFooterSigningEncryptor(); + WriteEncryptedFileMetadata(*file_metadata_, sink_.get(), footer_signing_encryptor, + false); + } + if (file_encryptor_) { + file_encryptor_->WipeOutEncryptionKeys(); + } + } + std::shared_ptr sink_; bool is_open_; const std::shared_ptr properties_; @@ -393,7 +396,7 @@ class FileSerializer : public ParquetFileWriter::Contents { if (file_encryption_properties->encrypted_footer()) { PARQUET_THROW_NOT_OK(sink_->Write(kParquetEMagic, 4)); } else { - // plaintext mode footer + // Encrypted file with plaintext footer mode. 
PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4)); } } @@ -432,43 +435,52 @@ std::unique_ptr ParquetFileWriter::Open( key_value_metadata); } -void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sink, - const std::shared_ptr& encryptor, bool encrypt_footer) { - if (encryptor == nullptr) { // plaintext regular file - // Write MetaData +void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sink) { + // Write MetaData + int64_t position = -1; + PARQUET_THROW_NOT_OK(sink->Tell(&position)); + uint32_t metadata_len = static_cast(position); + + file_metadata.WriteTo(sink); + PARQUET_THROW_NOT_OK(sink->Tell(&position)); + metadata_len = static_cast(position) - metadata_len; + + // Write Footer + PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast(&metadata_len), 4)); + PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4)); +} + +void WriteEncryptedFileMetadata(const FileMetaData& file_metadata, + ArrowOutputStream* sink, + const std::shared_ptr& encryptor, + bool encrypt_footer) { + if (encrypt_footer) { // Encrypted file with encrypted footer + // encrypt and write to sink + file_metadata.WriteTo(sink, encryptor); + } else { // Encrypted file with plaintext footer mode. 
int64_t position = -1; PARQUET_THROW_NOT_OK(sink->Tell(&position)); uint32_t metadata_len = static_cast(position); - - file_metadata.WriteTo(sink); + file_metadata.WriteTo(sink, encryptor); PARQUET_THROW_NOT_OK(sink->Tell(&position)); metadata_len = static_cast(position) - metadata_len; - // Write Footer PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast(&metadata_len), 4)); PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4)); - } else { // Encrypted file - if (encrypt_footer) { // Encrypted file with encrypted footer - // encrypt and write to sink - file_metadata.WriteTo(sink, encryptor); - } else { // Encrypted file with plaintext footer - int64_t position = -1; - PARQUET_THROW_NOT_OK(sink->Tell(&position)); - uint32_t metadata_len = static_cast(position); - file_metadata.WriteTo(sink, encryptor); - PARQUET_THROW_NOT_OK(sink->Tell(&position)); - metadata_len = static_cast(position) - metadata_len; - - PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast(&metadata_len), 4)); - PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4)); - } } } void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink, const std::shared_ptr& encryptor, bool encrypt_footer) { ParquetOutputWrapper wrapper(sink); - return WriteFileMetaData(file_metadata, &wrapper, encryptor, encrypt_footer); + return WriteFileMetaData(file_metadata, &wrapper); +} + +void WriteEncryptedFileMetadata(const FileMetaData& file_metadata, OutputStream* sink, + const std::shared_ptr& encryptor, + bool encrypt_footer) { + ParquetOutputWrapper wrapper(sink); + return WriteEncryptedFileMetadata(file_metadata, &wrapper, encryptor, encrypt_footer); } void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h index a1ec8183647f..51e9da7757b3 100644 --- a/cpp/src/parquet/file_writer.h +++ b/cpp/src/parquet/file_writer.h @@ -98,16 +98,24 @@ class PARQUET_EXPORT RowGroupWriter { ARROW_DEPRECATED("Use version with 
arrow::io::OutputStream*") PARQUET_EXPORT -void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink, - const std::shared_ptr& encryptor = NULLPTR, - bool encrypt_footer = false); +void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink); +PARQUET_EXPORT +void WriteEncryptedFileMetadata(const FileMetaData& file_metadata, + ArrowOutputStream* sink, + const std::shared_ptr& encryptor, + bool encrypt_footer); + void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, OutputStream* sink); PARQUET_EXPORT -void WriteFileMetaData(const FileMetaData& file_metadata, ::arrow::io::OutputStream* sink, - const std::shared_ptr& encryptor = NULLPTR, - bool encrypt_footer = false); +void WriteFileMetaData(const FileMetaData& file_metadata, + ::arrow::io::OutputStream* sink); +PARQUET_EXPORT +void WriteEncryptedFileMetadata(const FileMetaData& file_metadata, + ::arrow::io::OutputStream* sink, + const std::shared_ptr& encryptor = NULLPTR, + bool encrypt_footer = false); PARQUET_EXPORT void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, ::arrow::io::OutputStream* sink); diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index d46101a819b6..867c81a0113f 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -22,12 +22,6 @@ namespace parquet { // FooterSigningEncryptor -static inline uint8_t* str2bytes(const std::string& str) { - if (str.empty()) return NULLPTR; - - char* cbytes = const_cast(str.c_str()); - return reinterpret_cast(cbytes); -} FooterSigningEncryptor::FooterSigningEncryptor(ParquetCipher::type algorithm, const std::string& key, @@ -78,25 +72,11 @@ InternalFileDecryptor::InternalFileDecryptor(FileDecryptionProperties* propertie "Re-using decryption properties with explicit keys for another file"); } properties_->set_utilized(); - - all_decryptors_ = std::shared_ptr>( - new std::vector); - 
column_data_map_ = std::shared_ptr< - std::map, std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath>>( - new std::map, std::shared_ptr, - schema::ColumnPath::CmpColumnPath>()); - - column_metadata_map_ = std::shared_ptr< - std::map, std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath>>( - new std::map, std::shared_ptr, - schema::ColumnPath::CmpColumnPath>()); } -void InternalFileDecryptor::wipeout_decryption_keys() { - properties_->wipeout_decryption_keys(); - for (auto const& i : *all_decryptors_) { +void InternalFileDecryptor::WipeOutDecryptionKeys() { + properties_->WipeOutDecryptionKeys(); + for (auto const& i : all_decryptors_) { i->WipeOut(); } } @@ -180,16 +160,13 @@ std::shared_ptr InternalFileDecryptor::GetFooterDecryptor( auto aes_metadata_decryptor = GetMetaAesDecryptor(footer_key.size()); auto aes_data_decryptor = GetDataAesDecryptor(footer_key.size()); - std::shared_ptr footer_metadata_decryptor = + footer_metadata_decryptor_ = std::make_shared(aes_metadata_decryptor, footer_key, file_aad_, aad); - std::shared_ptr footer_data_decryptor = + footer_data_decryptor_ = std::make_shared(aes_data_decryptor, footer_key, file_aad_, aad); - footer_metadata_decryptor_ = footer_metadata_decryptor; - footer_data_decryptor_ = footer_data_decryptor; - - if (metadata) return footer_metadata_decryptor; - return footer_data_decryptor; + if (metadata) return footer_metadata_decryptor_; + return footer_data_decryptor_; } std::shared_ptr InternalFileDecryptor::GetColumnMetaDecryptor( @@ -210,12 +187,12 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( std::string column_key; // first look if we already got the decryptor from before if (metadata) { - if (column_metadata_map_->find(column_path) != column_metadata_map_->end()) { - return column_metadata_map_->at(column_path); + if (column_metadata_map_.find(column_path) != column_metadata_map_.end()) { + return column_metadata_map_.at(column_path); } } else { - if 
(column_data_map_->find(column_path) != column_data_map_->end()) { - return column_data_map_->at(column_path); + if (column_data_map_.find(column_path) != column_data_map_.end()) { + return column_data_map_.at(column_path); } } @@ -242,64 +219,43 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( auto aes_metadata_decryptor = GetMetaAesDecryptor(column_key.size()); auto aes_data_decryptor = GetDataAesDecryptor(column_key.size()); - std::shared_ptr metadata_decryptor = + column_metadata_map_[column_path] = std::make_shared(aes_metadata_decryptor, column_key, file_aad_, aad); - std::shared_ptr data_decryptor = + column_data_map_[column_path] = std::make_shared(aes_data_decryptor, column_key, file_aad_, aad); - (*column_metadata_map_)[column_path] = metadata_decryptor; - (*column_data_map_)[column_path] = data_decryptor; + if (metadata) return column_metadata_map_[column_path]; + return column_data_map_[column_path]; +} - if (metadata) return metadata_decryptor; - return data_decryptor; +int InternalFileDecryptor::MapKeyLenToDecryptorArrayIndex(int key_len) { + if (key_len == 16) + return 0; + else if (key_len == 24) + return 1; + else if (key_len == 32) + return 2; + throw ParquetException("decryption key must be 16, 24 or 32 bytes in length"); } encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor(size_t key_size) { int key_len = static_cast(key_size); - if (key_len == 16) { - if (meta_decryptor_128_ == NULLPTR) { - meta_decryptor_128_.reset( - encryption::AesDecryptor::Make(algorithm_, key_len, true, all_decryptors_)); - } - return meta_decryptor_128_.get(); - } else if (key_len == 24) { - if (meta_decryptor_196_ == NULLPTR) { - meta_decryptor_196_.reset( - encryption::AesDecryptor::Make(algorithm_, key_len, true, all_decryptors_)); - } - return meta_decryptor_196_.get(); - } else if (key_len == 32) { - if (meta_decryptor_256_ == NULLPTR) { - meta_decryptor_256_.reset( - encryption::AesDecryptor::Make(algorithm_, key_len, true, 
all_decryptors_)); - } - return meta_decryptor_256_.get(); + int index = MapKeyLenToDecryptorArrayIndex(key_len); + if (meta_decryptor_[index] == NULLPTR) { + meta_decryptor_[index].reset( + encryption::AesDecryptor::Make(algorithm_, key_len, true, &all_decryptors_)); } - throw ParquetException("encryption key must be 16, 24 or 32 bytes in length"); + return meta_decryptor_[index].get(); } encryption::AesDecryptor* InternalFileDecryptor::GetDataAesDecryptor(size_t key_size) { int key_len = static_cast(key_size); - if (key_len == 16) { - if (data_decryptor_128_ == NULLPTR) { - data_decryptor_128_.reset( - encryption::AesDecryptor::Make(algorithm_, key_len, false, all_decryptors_)); - } - return data_decryptor_128_.get(); - } else if (key_len == 24) { - if (data_decryptor_196_ == NULLPTR) { - data_decryptor_196_.reset( - encryption::AesDecryptor::Make(algorithm_, key_len, false, all_decryptors_)); - } - return data_decryptor_196_.get(); - } else if (key_len == 32) { - if (data_decryptor_256_ == NULLPTR) { - data_decryptor_256_.reset( - encryption::AesDecryptor::Make(algorithm_, key_len, false, all_decryptors_)); - } - return data_decryptor_256_.get(); + int index = MapKeyLenToDecryptorArrayIndex(key_len); + if (data_decryptor_[index] == NULLPTR) { + data_decryptor_[index].reset( + encryption::AesDecryptor::Make(algorithm_, key_len, false, &all_decryptors_)); } - throw ParquetException("encryption key must be 16, 24 or 32 bytes in length"); + return data_decryptor_[index].get(); } } // namespace parquet diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h index 3013183bf945..842eea7c6808 100644 --- a/cpp/src/parquet/internal_file_decryptor.h +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -56,7 +56,7 @@ class Decryptor { const std::string& file_aad, const std::string& aad); const std::string& file_aad() const { return file_aad_; } - void update_aad(const std::string& aad) { aad_ = aad; } + void UpdateAad(const 
std::string& aad) { aad_ = aad; } int CiphertextSizeDelta(); int Decrypt(const uint8_t* ciphertext, int ciphertext_len, uint8_t* plaintext); @@ -85,7 +85,7 @@ class InternalFileDecryptor { FileDecryptionProperties* properties() { return properties_; } - void wipeout_decryption_keys(); + void WipeOutDecryptionKeys(); std::shared_ptr GetFooterDecryptor(); std::shared_ptr GetFooterDecryptorForColumnMeta(const std::string& aad = ""); @@ -101,13 +101,11 @@ class InternalFileDecryptor { FileDecryptionProperties* properties_; // Concatenation of aad_prefix (if exists) and aad_file_unique std::string file_aad_; - std::shared_ptr< - std::map, std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath>> + std::map, std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> column_data_map_; - std::shared_ptr< - std::map, std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath>> + std::map, std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> column_metadata_map_; std::shared_ptr footer_metadata_decryptor_; @@ -115,14 +113,12 @@ class InternalFileDecryptor { ParquetCipher::type algorithm_; std::string footer_key_metadata_; std::shared_ptr footer_signing_encryptor_; - std::shared_ptr> all_decryptors_; + std::vector all_decryptors_; - std::unique_ptr meta_decryptor_128_; - std::unique_ptr meta_decryptor_196_; - std::unique_ptr meta_decryptor_256_; - std::unique_ptr data_decryptor_128_; - std::unique_ptr data_decryptor_196_; - std::unique_ptr data_decryptor_256_; + // Key must be 16, 24 or 32 bytes in length. Thus there could be up to three + // types of meta_decryptors and data_decryptors.
+ std::unique_ptr meta_decryptor_[3]; + std::unique_ptr data_decryptor_[3]; std::shared_ptr GetFooterDecryptor(const std::string& aad, bool metadata); std::shared_ptr GetColumnDecryptor( @@ -132,6 +128,8 @@ class InternalFileDecryptor { encryption::AesDecryptor* GetMetaAesDecryptor(size_t key_size); encryption::AesDecryptor* GetDataAesDecryptor(size_t key_size); + + int MapKeyLenToDecryptorArrayIndex(int key_len); }; } // namespace parquet diff --git a/cpp/src/parquet/internal_file_encryptor.cc b/cpp/src/parquet/internal_file_encryptor.cc index b558f5c7fb84..6f097a1a0b4d 100644 --- a/cpp/src/parquet/internal_file_encryptor.cc +++ b/cpp/src/parquet/internal_file_encryptor.cc @@ -21,13 +21,6 @@ namespace parquet { -static inline uint8_t* str2bytes(const std::string& str) { - if (str.empty()) return NULLPTR; - - char* cbytes = const_cast(str.c_str()); - return reinterpret_cast(cbytes); -} - // Encryptor Encryptor::Encryptor(encryption::AesEncryptor* aes_encryptor, const std::string& key, const std::string& file_aad, const std::string& aad) @@ -44,26 +37,16 @@ int Encryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* cip // InternalFileEncryptor InternalFileEncryptor::InternalFileEncryptor(FileEncryptionProperties* properties) : properties_(properties) { - all_encryptors_ = std::shared_ptr>( - new std::vector); - - column_data_map_ = std::shared_ptr< - std::map, std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath>>( - new std::map, std::shared_ptr, - schema::ColumnPath::CmpColumnPath>()); - - column_metadata_map_ = std::shared_ptr< - std::map, std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath>>( - new std::map, std::shared_ptr, - schema::ColumnPath::CmpColumnPath>()); + if (properties_->is_utilized()) { + throw ParquetException("Re-using encryption properties for another file"); + } + properties_->set_utilized(); } -void InternalFileEncryptor::wipeout_encryption_keys() { - properties_->wipeout_encryption_keys(); +void 
InternalFileEncryptor::WipeOutEncryptionKeys() { + properties_->WipeOutEncryptionKeys(); - for (auto const& i : *all_encryptors_) { + for (auto const& i : all_encryptors_) { i->WipeOut(); } } @@ -77,10 +60,9 @@ std::shared_ptr InternalFileEncryptor::GetFooterEncryptor() { std::string footer_aad = encryption::CreateFooterAad(properties_->file_aad()); std::string footer_key = properties_->footer_key(); auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_key.size()); - std::shared_ptr encryptor = std::make_shared( - aes_encryptor, footer_key, properties_->file_aad(), footer_aad); - footer_encryptor_ = encryptor; - return encryptor; + footer_encryptor_ = std::make_shared(aes_encryptor, footer_key, + properties_->file_aad(), footer_aad); + return footer_encryptor_; } std::shared_ptr InternalFileEncryptor::GetFooterSigningEncryptor() { @@ -92,10 +74,9 @@ std::shared_ptr InternalFileEncryptor::GetFooterSigningEncryptor() { std::string footer_aad = encryption::CreateFooterAad(properties_->file_aad()); std::string footer_signing_key = properties_->footer_key(); auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_signing_key.size()); - std::shared_ptr encryptor = std::make_shared( + footer_signing_encryptor_ = std::make_shared( aes_encryptor, footer_signing_key, properties_->file_aad(), footer_aad); - footer_signing_encryptor_ = encryptor; - return encryptor; + return footer_signing_encryptor_; } std::shared_ptr InternalFileEncryptor::GetColumnMetaEncryptor( @@ -113,17 +94,17 @@ InternalFileEncryptor::InternalFileEncryptor::GetColumnEncryptor( const std::shared_ptr& column_path, bool metadata) { // first look if we already got the encryptor from before if (metadata) { - if (column_metadata_map_->find(column_path) != column_metadata_map_->end()) { - return column_metadata_map_->at(column_path); + if (column_metadata_map_.find(column_path) != column_metadata_map_.end()) { + return column_metadata_map_.at(column_path); } } else { - if 
(column_data_map_->find(column_path) != column_data_map_->end()) { - return column_data_map_->at(column_path); + if (column_data_map_.find(column_path) != column_data_map_.end()) { + return column_data_map_.at(column_path); } } auto column_prop = properties_->column_properties(column_path); if (column_prop == NULLPTR) { - return NULLPTR; + return nullptr; } std::string key; @@ -141,61 +122,43 @@ InternalFileEncryptor::InternalFileEncryptor::GetColumnEncryptor( std::shared_ptr encryptor = std::make_shared(aes_encryptor, key, file_aad, ""); if (metadata) - (*column_metadata_map_)[column_path] = encryptor; + column_metadata_map_[column_path] = encryptor; else - (*column_data_map_)[column_path] = encryptor; + column_data_map_[column_path] = encryptor; return encryptor; } +int InternalFileEncryptor::MapKeyLenToEncryptorArrayIndex(int key_len) { + if (key_len == 16) + return 0; + else if (key_len == 24) + return 1; + else if (key_len == 32) + return 2; + throw ParquetException("encryption key must be 16, 24 or 32 bytes in length"); +} + encryption::AesEncryptor* InternalFileEncryptor::GetMetaAesEncryptor( ParquetCipher::type algorithm, size_t key_size) { int key_len = static_cast(key_size); - if (key_len == 16) { - if (meta_encryptor_128_ == NULLPTR) { - meta_encryptor_128_.reset( - encryption::AesEncryptor::Make(algorithm, key_len, true, all_encryptors_)); - } - return meta_encryptor_128_.get(); - } else if (key_len == 24) { - if (meta_encryptor_196_ == NULLPTR) { - meta_encryptor_196_.reset( - encryption::AesEncryptor::Make(algorithm, key_len, true, all_encryptors_)); - } - return meta_encryptor_196_.get(); - } else if (key_len == 32) { - if (meta_encryptor_256_ == NULLPTR) { - meta_encryptor_256_.reset( - encryption::AesEncryptor::Make(algorithm, key_len, true, all_encryptors_)); - } - return meta_encryptor_256_.get(); + int index = MapKeyLenToEncryptorArrayIndex(key_len); + if (meta_encryptor_[index] == NULLPTR) { + meta_encryptor_[index].reset( + 
encryption::AesEncryptor::Make(algorithm, key_len, true, &all_encryptors_)); } - throw ParquetException("encryption key must be 16, 24 or 32 bytes in length"); + return meta_encryptor_[index].get(); } encryption::AesEncryptor* InternalFileEncryptor::GetDataAesEncryptor( ParquetCipher::type algorithm, size_t key_size) { int key_len = static_cast(key_size); - if (key_len == 16) { - if (data_encryptor_128_ == NULLPTR) { - data_encryptor_128_.reset( - encryption::AesEncryptor::Make(algorithm, key_len, false, all_encryptors_)); - } - return data_encryptor_128_.get(); - } else if (key_len == 24) { - if (data_encryptor_196_ == NULLPTR) { - data_encryptor_196_.reset( - encryption::AesEncryptor::Make(algorithm, key_len, false, all_encryptors_)); - } - return data_encryptor_196_.get(); - } else if (key_len == 32) { - if (data_encryptor_256_ == NULLPTR) { - data_encryptor_256_.reset( - encryption::AesEncryptor::Make(algorithm, key_len, false, all_encryptors_)); - } - return data_encryptor_256_.get(); + int index = MapKeyLenToEncryptorArrayIndex(key_len); + if (data_encryptor_[index] == NULLPTR) { + data_encryptor_[index].reset( + encryption::AesEncryptor::Make(algorithm, key_len, false, &all_encryptors_)); } - throw ParquetException("encryption key must be 16, 24 or 32 bytes in length"); + return data_encryptor_[index].get(); } } // namespace parquet diff --git a/cpp/src/parquet/internal_file_encryptor.h b/cpp/src/parquet/internal_file_encryptor.h index 9fc0227e45f0..efef532b1639 100644 --- a/cpp/src/parquet/internal_file_encryptor.h +++ b/cpp/src/parquet/internal_file_encryptor.h @@ -40,12 +40,12 @@ class Encryptor { Encryptor(encryption::AesEncryptor* aes_encryptor, const std::string& key, const std::string& file_aad, const std::string& aad); const std::string& file_aad() { return file_aad_; } - void update_aad(const std::string& aad) { aad_ = aad; } + void UpdateAad(const std::string& aad) { aad_ = aad; } int CiphertextSizeDelta(); int Encrypt(const uint8_t* plaintext, int 
plaintext_len, uint8_t* ciphertext); - bool encryptColumnMetaData( + bool EncryptColumnMetaData( bool encrypted_footer, const std::shared_ptr& column_encryption_properties) { // if column is not encrypted then do not encrypt the column metadata @@ -74,31 +74,27 @@ class InternalFileEncryptor { const std::shared_ptr& column_path); std::shared_ptr GetColumnDataEncryptor( const std::shared_ptr& column_path); - void wipeout_encryption_keys(); + void WipeOutEncryptionKeys(); private: FileEncryptionProperties* properties_; - std::shared_ptr< - std::map, std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath>> + std::map, std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> column_data_map_; - std::shared_ptr< - std::map, std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath>> + std::map, std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> column_metadata_map_; std::shared_ptr footer_signing_encryptor_; std::shared_ptr footer_encryptor_; - std::shared_ptr> all_encryptors_; + std::vector all_encryptors_; - std::unique_ptr meta_encryptor_128_; - std::unique_ptr meta_encryptor_196_; - std::unique_ptr meta_encryptor_256_; - std::unique_ptr data_encryptor_128_; - std::unique_ptr data_encryptor_196_; - std::unique_ptr data_encryptor_256_; + // Key must be 16, 24 or 32 bytes in length. Thus there could be up to three + // types of meta_encryptors and data_encryptors. 
+ std::unique_ptr meta_encryptor_[3]; + std::unique_ptr data_encryptor_[3]; std::shared_ptr GetColumnEncryptor( const std::shared_ptr& column_path, bool metadata); @@ -107,6 +103,8 @@ class InternalFileEncryptor { size_t key_len); encryption::AesEncryptor* GetDataAesEncryptor(ParquetCipher::type algorithm, size_t key_len); + + int MapKeyLenToEncryptorArrayIndex(int key_len); }; } // namespace parquet diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index e386eb91def9..3cb962cd022d 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -199,7 +199,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { uint32_t len = static_cast(column->encrypted_column_metadata.size()); DeserializeThriftMsg( reinterpret_cast(column->encrypted_column_metadata.c_str()), - &len, &decrypted_metadata_, decryptor, false); + &len, &decrypted_metadata_, decryptor); is_metadata_set_ = true; } } else { @@ -479,7 +479,7 @@ class FileMetaData::FileMetaDataImpl { : metadata_len_(0) { metadata_.reset(new format::FileMetaData); DeserializeThriftMsg(reinterpret_cast(metadata), metadata_len, - metadata_.get(), decryptor, false); + metadata_.get(), decryptor); metadata_len_ = *metadata_len; if (metadata_->__isset.created_by) { @@ -564,7 +564,7 @@ class FileMetaData::FileMetaDataImpl { encryption::kGcmTagLength)); } else { // either plaintext file (when encryptor is null) // or encrypted file with encrypted footer - serializer.Serialize(metadata_.get(), dst, encryptor, false); + serializer.Serialize(metadata_.get(), dst, encryptor); } } diff --git a/cpp/src/parquet/parquet.pc b/cpp/src/parquet/parquet.pc deleted file mode 100644 index e46eea65b729..000000000000 --- a/cpp/src/parquet/parquet.pc +++ /dev/null @@ -1,30 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -prefix=/usr/local -libdir=${prefix}/lib -includedir=${prefix}/include - -so_version=13 -abi_version=13 -full_so_version=13.0.0 - -Name: Apache Parquet -Description: Apache Parquet is a columnar storage format. -Version: 1.5.1-SNAPSHOT -Libs: -L${libdir} -lparquet -Cflags: -I${includedir} diff --git a/cpp/src/parquet/parquet.thrift b/cpp/src/parquet/parquet.thrift index b6795e3d4874..288c72a60a64 100644 --- a/cpp/src/parquet/parquet.thrift +++ b/cpp/src/parquet/parquet.thrift @@ -874,7 +874,7 @@ struct AesGcmV1 { /** Unique file identifier part of AAD suffix **/ 2: optional binary aad_file_unique - + /** In files encrypted with AAD prefix without storing it, * readers must supply the prefix **/ 3: optional bool supply_aad_prefix @@ -968,5 +968,3 @@ struct FileCryptoMetaData { * and (possibly) columns **/ 2: optional binary key_metadata } - - diff --git a/cpp/src/parquet/parquet_version.h b/cpp/src/parquet/parquet_version.h deleted file mode 100644 index dd83e45028df..000000000000 --- a/cpp/src/parquet/parquet_version.h +++ /dev/null @@ -1,24 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef PARQUET_VERSION_H -#define PARQUET_VERSION_H - -// define the parquet created by version -#define CREATED_BY_VERSION "parquet-cpp version 1.5.1-SNAPSHOT" - -#endif // PARQUET_VERSION_H diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 709f17970b0a..1e5fdc3d84db 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -164,12 +164,9 @@ static inline format::Statistics ToThrift(const EncodedStatistics& stats) { static inline format::AesGcmV1 ToAesGcmV1Thrift(AadMetadata aad) { format::AesGcmV1 aesGcmV1; // aad_file_unique is always set - aesGcmV1.__isset.aad_file_unique = true; aesGcmV1.__set_aad_file_unique(aad.aad_file_unique); - aesGcmV1.__isset.supply_aad_prefix = true; aesGcmV1.__set_supply_aad_prefix(aad.supply_aad_prefix); if (!aad.aad_prefix.empty()) { - aesGcmV1.__isset.aad_prefix = true; aesGcmV1.__set_aad_prefix(aad.aad_prefix); } return aesGcmV1; @@ -178,12 +175,9 @@ static inline format::AesGcmV1 ToAesGcmV1Thrift(AadMetadata aad) { static inline format::AesGcmCtrV1 ToAesGcmCtrV1Thrift(AadMetadata aad) { format::AesGcmCtrV1 aesGcmCtrV1; // aad_file_unique is always set - aesGcmCtrV1.__isset.aad_file_unique = true; aesGcmCtrV1.__set_aad_file_unique(aad.aad_file_unique); - aesGcmCtrV1.__isset.supply_aad_prefix = true; aesGcmCtrV1.__set_supply_aad_prefix(aad.supply_aad_prefix); if (!aad.aad_prefix.empty()) { - 
aesGcmCtrV1.__isset.aad_prefix = true; aesGcmCtrV1.__set_aad_prefix(aad.aad_prefix); } return aesGcmCtrV1; @@ -192,11 +186,9 @@ static inline format::AesGcmCtrV1 ToAesGcmCtrV1Thrift(AadMetadata aad) { static inline format::EncryptionAlgorithm ToThrift(EncryptionAlgorithm encryption) { format::EncryptionAlgorithm encryption_algorithm; if (encryption.algorithm == ParquetCipher::AES_GCM_V1) { - encryption_algorithm.__isset.AES_GCM_V1 = true; - encryption_algorithm.AES_GCM_V1 = ToAesGcmV1Thrift(encryption.aad); + encryption_algorithm.__set_AES_GCM_V1(ToAesGcmV1Thrift(encryption.aad)); } else { - encryption_algorithm.__isset.AES_GCM_CTR_V1 = true; - encryption_algorithm.AES_GCM_CTR_V1 = ToAesGcmCtrV1Thrift(encryption.aad); + encryption_algorithm.__set_AES_GCM_CTR_V1(ToAesGcmCtrV1Thrift(encryption.aad)); } return encryption_algorithm; } @@ -206,50 +198,51 @@ static inline format::EncryptionAlgorithm ToThrift(EncryptionAlgorithm encryptio using ThriftBuffer = apache::thrift::transport::TMemoryBuffer; +template +inline void DeserializeThriftUnencryptedMsg(const uint8_t* buf, uint32_t* len, + T* deserialized_msg) { + // Deserialize msg bytes into c++ thrift msg using memory transport. + shared_ptr tmem_transport( + new ThriftBuffer(const_cast(buf), *len)); + apache::thrift::protocol::TCompactProtocolFactoryT tproto_factory; + shared_ptr tproto = // + tproto_factory.getProtocol(tmem_transport); + try { + deserialized_msg->read(tproto.get()); + } catch (std::exception& e) { + std::stringstream ss; + ss << "Couldn't deserialize thrift: " << e.what() << "\n"; + throw ParquetException(ss.str()); + } + uint32_t bytes_left = tmem_transport->available_read(); + *len = *len - bytes_left; +} + // Deserialize a thrift message from buf/len. buf/len must at least contain // all the bytes needed to store the thrift message. On return, len will be // set to the actual length of the header. 
template inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg, - const std::shared_ptr& decryptor = NULLPTR, - bool shouldReadLength = false) { + const std::shared_ptr& decryptor = NULLPTR) { // thrift message is not encrypted if (decryptor == NULLPTR) { - // Deserialize msg bytes into c++ thrift msg using memory transport. - shared_ptr tmem_transport( - new ThriftBuffer(const_cast(buf), *len)); - apache::thrift::protocol::TCompactProtocolFactoryT tproto_factory; - shared_ptr tproto = // - tproto_factory.getProtocol(tmem_transport); - try { - deserialized_msg->read(tproto.get()); - } catch (std::exception& e) { - std::stringstream ss; - ss << "Couldn't deserialize thrift: " << e.what() << "\n"; - throw ParquetException(ss.str()); - } - uint32_t bytes_left = tmem_transport->available_read(); - *len = *len - bytes_left; + DeserializeThriftUnencryptedMsg(buf, len, deserialized_msg); } else { // thrift message is encrypted uint32_t clen; - if (shouldReadLength) { - // first 4 bytes for length - uint8_t clenBytes[4]; - memcpy(clenBytes, buf, 4); - clen = *(reinterpret_cast(clenBytes)); - } else { - clen = *len; - } + clen = *len; // decrypt - const uint8_t* cipherBuf = shouldReadLength ? 
&buf[4] : buf; - std::vector decrypted_buffer(clen - decryptor->CiphertextSizeDelta()); + std::shared_ptr decrypted_buffer = + std::static_pointer_cast(AllocateBuffer( + ::arrow::default_memory_pool(), + static_cast(clen - decryptor->CiphertextSizeDelta()))); + const uint8_t* cipher_buf = buf; uint32_t decrypted_buffer_len = - decryptor->Decrypt(cipherBuf, 0, decrypted_buffer.data()); + decryptor->Decrypt(cipher_buf, 0, decrypted_buffer->mutable_data()); if (decrypted_buffer_len <= 0) { throw ParquetException("Couldn't decrypt buffer\n"); } *len = decrypted_buffer_len + decryptor->CiphertextSizeDelta(); - DeserializeThriftMsg(decrypted_buffer.data(), &decrypted_buffer_len, + DeserializeThriftMsg(decrypted_buffer->data(), &decrypted_buffer_len, deserialized_msg); } } @@ -283,8 +276,7 @@ class ThriftSerializer { template int64_t Serialize(const T* obj, ArrowOutputStream* out, - const std::shared_ptr& encryptor = NULLPTR, - bool shouldWriteLength = false) { + const std::shared_ptr& encryptor = NULLPTR) { uint8_t* out_buffer; uint32_t out_length; SerializeToBuffer(obj, &out_length, &out_buffer); @@ -294,26 +286,7 @@ class ThriftSerializer { PARQUET_THROW_NOT_OK(out->Write(out_buffer, out_length)); return static_cast(out_length); } else { // obj is encrypted - std::vector cipher_buffer(encryptor->CiphertextSizeDelta() + out_length); - int cipher_buffer_len = - encryptor->Encrypt(out_buffer, out_length, cipher_buffer.data()); - - if (cipher_buffer_len > static_cast(cipher_buffer.size())) { - std::stringstream ss; - ss << "cipher length is greater than cipher buffer capacity: " - << cipher_buffer_len << cipher_buffer.size() << "\n"; - throw ParquetException(ss.str()); - } - - if (shouldWriteLength) { - PARQUET_THROW_NOT_OK( - out->Write(reinterpret_cast(&cipher_buffer_len), 4)); - PARQUET_THROW_NOT_OK(out->Write(cipher_buffer.data(), cipher_buffer_len)); - return static_cast(cipher_buffer_len + 4); - } else { - PARQUET_THROW_NOT_OK(out->Write(cipher_buffer.data(), 
cipher_buffer_len)); - return static_cast(cipher_buffer_len); - } + return SerializeEncryptedObj(out, out_buffer, out_length, encryptor); } } @@ -330,6 +303,20 @@ class ThriftSerializer { } } + int64_t SerializeEncryptedObj(ArrowOutputStream* out, uint8_t* out_buffer, + uint32_t out_length, + const std::shared_ptr& encryptor) { + std::shared_ptr cipher_buffer = + std::static_pointer_cast(AllocateBuffer( + ::arrow::default_memory_pool(), + static_cast(encryptor->CiphertextSizeDelta() + out_length))); + int cipher_buffer_len = + encryptor->Encrypt(out_buffer, out_length, cipher_buffer->mutable_data()); + + PARQUET_THROW_NOT_OK(out->Write(cipher_buffer->data(), cipher_buffer_len)); + return static_cast(cipher_buffer_len); + } + shared_ptr mem_buffer_; shared_ptr protocol_; }; From 9397cd2da04519303355216403d0204f133cb1bf Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Fri, 21 Jun 2019 16:23:17 +0300 Subject: [PATCH 112/201] Fix SerializedPageReader initialization --- cpp/src/parquet/column_reader.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 04b48f0af1f8..ed5c2bb4976f 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -122,11 +122,16 @@ class SerializedPageReader : public PageReader { ::arrow::MemoryPool* pool, struct PageReaderContext* ctx) : stream_(stream), decompression_buffer_(AllocateBuffer(pool, 0)), + column_has_dictionary_(false), first_page_(true), + row_group_ordinal_(-1), + column_ordinal_(-1), page_ordinal_(-1), seen_num_rows_(0), total_num_rows_(total_num_rows), - decryption_buffer_(AllocateBuffer(pool, 0)) { + decryption_buffer_(AllocateBuffer(pool, 0)), + meta_decryptor_(NULLPTR), + data_decryptor_(NULLPTR) { if (ctx != NULLPTR) { column_has_dictionary_ = ctx->column_has_dictionary; row_group_ordinal_ = ctx->row_group_ordinal; From 6e5d7ecc9cc414233e236e52444091dedeccb499 Mon Sep 17 00:00:00 2001 From: Revital1 
Eres Date: Sun, 23 Jun 2019 11:46:35 +0300 Subject: [PATCH 113/201] Fix Format --- cpp/src/parquet/encryption_internal.cc | 13 +++++-------- cpp/src/parquet/encryption_internal.h | 4 ++-- cpp/src/parquet/file_reader.cc | 12 ++++++------ cpp/src/parquet/metadata.cc | 4 ++-- 4 files changed, 15 insertions(+), 18 deletions(-) diff --git a/cpp/src/parquet/encryption_internal.cc b/cpp/src/parquet/encryption_internal.cc index b73f910b2306..695b9b5db723 100644 --- a/cpp/src/parquet/encryption_internal.cc +++ b/cpp/src/parquet/encryption_internal.cc @@ -389,9 +389,8 @@ AesDecryptor::AesDecryptorImpl::AesDecryptorImpl(ParquetCipher::type alg_id, int } } -AesEncryptor* AesEncryptor::Make( - ParquetCipher::type alg_id, int key_len, bool metadata, - std::vector *all_encryptors) { +AesEncryptor* AesEncryptor::Make(ParquetCipher::type alg_id, int key_len, bool metadata, + std::vector* all_encryptors) { if (ParquetCipher::AES_GCM_V1 != alg_id && ParquetCipher::AES_GCM_CTR_V1 != alg_id) { std::stringstream ss; ss << "Crypto algorithm " << alg_id << " is not supported"; @@ -399,8 +398,7 @@ AesEncryptor* AesEncryptor::Make( } AesEncryptor* encryptor = new AesEncryptor(alg_id, key_len, metadata); - if (all_encryptors != NULLPTR) - all_encryptors->push_back(encryptor); + if (all_encryptors != NULLPTR) all_encryptors->push_back(encryptor); return encryptor; } @@ -408,9 +406,8 @@ AesDecryptor::AesDecryptor(ParquetCipher::type alg_id, int key_len, bool metadat : impl_{std::unique_ptr( new AesDecryptorImpl(alg_id, key_len, metadata))} {} -AesDecryptor* AesDecryptor::Make( - ParquetCipher::type alg_id, int key_len, bool metadata, - std::vector *all_decryptors) { +AesDecryptor* AesDecryptor::Make(ParquetCipher::type alg_id, int key_len, bool metadata, + std::vector* all_decryptors) { if (ParquetCipher::AES_GCM_V1 != alg_id && ParquetCipher::AES_GCM_CTR_V1 != alg_id) { std::stringstream ss; ss << "Crypto algorithm " << alg_id << " is not supported"; diff --git 
a/cpp/src/parquet/encryption_internal.h b/cpp/src/parquet/encryption_internal.h index 410d1ed5472b..9fe82bd28da8 100644 --- a/cpp/src/parquet/encryption_internal.h +++ b/cpp/src/parquet/encryption_internal.h @@ -47,7 +47,7 @@ constexpr int8_t kOffsetIndex = 7; class AesEncryptor { public: static AesEncryptor* Make(ParquetCipher::type alg_id, int key_len, bool metadata, - std::vector *all_encryptors); + std::vector* all_encryptors); ~AesEncryptor(); @@ -78,7 +78,7 @@ class AesEncryptor { class AesDecryptor { public: static AesDecryptor* Make(ParquetCipher::type alg_id, int key_len, bool metadata, - std::vector *all_decryptors); + std::vector* all_decryptors); ~AesDecryptor(); void WipeOut(); diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index fc3e7dbd1e36..a6e6abbd3d7a 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -58,9 +58,9 @@ RowGroupReader::RowGroupReader(std::unique_ptr contents) : contents_(std::move(contents)) {} std::shared_ptr RowGroupReader::Column(int i) { - DCHECK(i < metadata()->num_columns()) - << "The RowGroup only has " << metadata()->num_columns() - << "columns, requested column: " << i; + DCHECK(i < metadata()->num_columns()) << "The RowGroup only has " + << metadata()->num_columns() + << "columns, requested column: " << i; const ColumnDescriptor* descr = metadata()->schema()->Column(i); std::unique_ptr page_reader = contents_->GetColumnPageReader(i); @@ -70,9 +70,9 @@ std::shared_ptr RowGroupReader::Column(int i) { } std::unique_ptr RowGroupReader::GetColumnPageReader(int i) { - DCHECK(i < metadata()->num_columns()) - << "The RowGroup only has " << metadata()->num_columns() - << "columns, requested column: " << i; + DCHECK(i < metadata()->num_columns()) << "The RowGroup only has " + << metadata()->num_columns() + << "columns, requested column: " << i; return contents_->GetColumnPageReader(i); } diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 
3cb962cd022d..9ac7cf9c5edf 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -307,8 +307,8 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { bool is_metadata_set_; inline const format::ColumnMetaData& GetMetadataIfSet() const { - if (column_->__isset.crypto_metadata - && column_->crypto_metadata.__isset.ENCRYPTION_WITH_COLUMN_KEY) { + if (column_->__isset.crypto_metadata && + column_->crypto_metadata.__isset.ENCRYPTION_WITH_COLUMN_KEY) { if (!is_metadata_set_) { throw ParquetException( "Cannot decrypt ColumnMetadata. " From fb7ac18220567370b01d75ba6bacbe9e41f9128e Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Sun, 23 Jun 2019 22:42:35 +0700 Subject: [PATCH 114/201] let parquet encryption be able to be off (when openssl is not found) --- cpp/examples/parquet/CMakeLists.txt | 26 +++-- cpp/src/parquet/CMakeLists.txt | 16 ++- cpp/src/parquet/column_reader.cc | 56 ++++++--- cpp/src/parquet/column_reader.h | 6 + cpp/src/parquet/column_writer.cc | 87 ++++++++++++-- cpp/src/parquet/column_writer.h | 7 ++ cpp/src/parquet/file_reader.cc | 138 ++++++++++++++++------ cpp/src/parquet/file_writer.cc | 72 ++++++++++-- cpp/src/parquet/file_writer.h | 19 ++-- cpp/src/parquet/metadata.cc | 171 ++++++++++++++++++++++++---- cpp/src/parquet/metadata.h | 75 ++++++++++-- cpp/src/parquet/properties.h | 45 ++++++-- cpp/src/parquet/thrift.h | 36 ++++-- 13 files changed, 618 insertions(+), 136 deletions(-) diff --git a/cpp/examples/parquet/CMakeLists.txt b/cpp/examples/parquet/CMakeLists.txt index fb4287303607..813484641fe4 100644 --- a/cpp/examples/parquet/CMakeLists.txt +++ b/cpp/examples/parquet/CMakeLists.txt @@ -17,16 +17,20 @@ add_executable(parquet-low-level-example low-level-api/reader-writer.cc) add_executable(parquet-low-level-example2 low-level-api/reader-writer2.cc) -add_executable(parquet-encryption-example low-level-api/encryption-reader-writer.cc) -add_executable(parquet-encryption-example-all-crypto-options 
low-level-api/encryption-reader-writer-all-crypto-options.cc) target_include_directories(parquet-low-level-example PRIVATE low-level-api/) target_include_directories(parquet-low-level-example2 PRIVATE low-level-api/) -target_include_directories(parquet-encryption-example PRIVATE low-level-api/) -target_include_directories(parquet-encryption-example-all-crypto-options PRIVATE low-level-api/) target_link_libraries(parquet-low-level-example parquet_static) target_link_libraries(parquet-low-level-example2 parquet_static) -target_link_libraries(parquet-encryption-example parquet_static) -target_link_libraries(parquet-encryption-example-all-crypto-options parquet_static) + +if (ARROW_USE_OPENSSL) + add_definitions(-DPARQUET_ENCRYPTION) + add_executable(parquet-encryption-example low-level-api/encryption-reader-writer.cc) + add_executable(parquet-encryption-example-all-crypto-options low-level-api/encryption-reader-writer-all-crypto-options.cc) + target_include_directories(parquet-encryption-example PRIVATE low-level-api/) + target_include_directories(parquet-encryption-example-all-crypto-options PRIVATE low-level-api/) + target_link_libraries(parquet-encryption-example parquet_static) + target_link_libraries(parquet-encryption-example-all-crypto-options parquet_static) +endif() add_executable(parquet-arrow-example parquet-arrow/reader-writer.cc) # Prefer shared linkage but use static if shared build is deactivated @@ -39,6 +43,10 @@ endif() add_dependencies(parquet parquet-low-level-example parquet-low-level-example2 - parquet-encryption-example - parquet-arrow-example - parquet-encryption-example-all-crypto-options) + parquet-arrow-example) + +if (ARROW_USE_OPENSSL) + add_dependencies(parquet + parquet-encryption-example + parquet-encryption-example-all-crypto-options) +endif() diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index c715b0c4684d..32888014080a 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ 
-167,10 +167,6 @@ set(PARQUET_SRCS column_writer.cc deprecated_io.cc encoding.cc - encryption.cc - encryption_internal.cc - internal_file_decryptor.cc - internal_file_encryptor.cc file_reader.cc file_writer.cc metadata.cc @@ -184,6 +180,16 @@ set(PARQUET_SRCS statistics.cc types.cc) +if (ARROW_USE_OPENSSL) + add_definitions(-DPARQUET_ENCRYPTION) + set(PARQUET_SRCS + ${PARQUET_SRCS} + encryption.cc + encryption_internal.cc + internal_file_decryptor.cc + internal_file_encryptor.cc) +endif() + # Ensure that thrift compilation is done before using its generated headers # in parquet code. add_custom_target(parquet-thrift-deps ALL DEPENDS ${THRIFT_OUTPUT_FILES}) @@ -321,7 +327,7 @@ add_parquet_test(arrow-test arrow/arrow_schema_test.cc test_util.cc) -if(PARQUET_BUILD_ENCRYPTION) +if(ARROW_USE_OPENSSL) add_parquet_test(encryption-test SOURCES encryption-properties-test.cc diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index ed5c2bb4976f..4b831a8d3b72 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -40,12 +40,15 @@ #include "parquet/column_page.h" #include "parquet/encoding.h" -#include "parquet/encryption_internal.h" -#include "parquet/internal_file_decryptor.h" #include "parquet/properties.h" #include "parquet/statistics.h" #include "parquet/thrift.h" // IWYU pragma: keep +#ifdef PARQUET_ENCRYPTION +#include "parquet/encryption_internal.h" +#include "parquet/internal_file_decryptor.h" +#endif + using arrow::MemoryPool; using arrow::internal::checked_cast; @@ -128,22 +131,28 @@ class SerializedPageReader : public PageReader { column_ordinal_(-1), page_ordinal_(-1), seen_num_rows_(0), - total_num_rows_(total_num_rows), + total_num_rows_(total_num_rows) +#ifdef PARQUET_ENCRYPTION + , decryption_buffer_(AllocateBuffer(pool, 0)), meta_decryptor_(NULLPTR), - data_decryptor_(NULLPTR) { + data_decryptor_(NULLPTR) +#endif + { if (ctx != NULLPTR) { column_has_dictionary_ = ctx->column_has_dictionary; 
row_group_ordinal_ = ctx->row_group_ordinal; column_ordinal_ = ctx->column_ordinal; +#ifdef PARQUET_ENCRYPTION meta_decryptor_ = ctx->meta_decryptor; data_decryptor_ = ctx->data_decryptor; + if (data_decryptor_ != NULLPTR || meta_decryptor_ != NULLPTR) { + InitDecryption(); + } +#endif } max_page_header_size_ = kDefaultMaxPageHeaderSize; decompressor_ = GetCodecFromArrow(codec); - if (data_decryptor_ != NULLPTR || meta_decryptor_ != NULLPTR) { - InitDecryption(); - } } // Implement the PageReader interface @@ -152,11 +161,13 @@ class SerializedPageReader : public PageReader { void set_max_page_header_size(uint32_t size) override { max_page_header_size_ = size; } private: +#ifdef PARQUET_ENCRYPTION void UpdateDecryption(const std::shared_ptr& decryptor, bool current_page_is_dictionary, int8_t module_type, const std::string& pageAAD); void InitDecryption(); +#endif std::shared_ptr stream_; @@ -188,12 +199,6 @@ class SerializedPageReader : public PageReader { int16_t row_group_ordinal_; int16_t column_ordinal_; int16_t page_ordinal_; - // data_pageAAD_ and data_page_headerAAD_ contain the AAD for data page and data page - // header in a single column respectively. - // While calculating AAD for different pages in a single column the pages AAD is - // updated by only the page ordinal. - std::string data_pageAAD_; - std::string data_page_headerAAD_; // Maximum allowed page size uint32_t max_page_header_size_; @@ -204,12 +209,21 @@ class SerializedPageReader : public PageReader { // Number of rows in all the data pages int64_t total_num_rows_; +#ifdef PARQUET_ENCRYPTION + // data_pageAAD_ and data_page_headerAAD_ contain the AAD for data page and data page + // header in a single column respectively. + // While calculating AAD for different pages in a single column the pages AAD is + // updated by only the page ordinal. 
+ std::string data_pageAAD_; + std::string data_page_headerAAD_; // Encryption std::shared_ptr decryption_buffer_; std::shared_ptr meta_decryptor_; std::shared_ptr data_decryptor_; +#endif }; +#ifdef PARQUET_ENCRYPTION void SerializedPageReader::InitDecryption() { // Prepare the AAD for quick update later. if (data_decryptor_ != NULLPTR) { @@ -241,14 +255,19 @@ void SerializedPageReader::UpdateDecryption(const std::shared_ptr& de decryptor->UpdateAad(pageAAD); } } +#endif // PARQUET_ENCRYPTION std::shared_ptr SerializedPageReader::NextPage() { // Loop here because there may be unhandled page types that we skip until // finding a page that we do know what to do with +#ifdef PARQUET_ENCRYPTION bool current_page_is_dictionary = false; +#endif if (column_has_dictionary_) { if (first_page_) { +#ifdef PARQUET_ENCRYPTION current_page_is_dictionary = true; +#endif first_page_ = false; } else { page_ordinal_++; @@ -274,12 +293,17 @@ std::shared_ptr SerializedPageReader::NextPage() { // This gets used, then set by DeserializeThriftMsg header_size = static_cast(buffer.size()); try { +#ifdef PARQUET_ENCRYPTION if (meta_decryptor_ != NULLPTR) { UpdateDecryption(meta_decryptor_, current_page_is_dictionary, encryption::kDictionaryPageHeader, data_page_headerAAD_); } DeserializeThriftMsg(reinterpret_cast(buffer.data()), &header_size, ¤t_page_header_, meta_decryptor_); +#else + DeserializeThriftMsg(reinterpret_cast(buffer.data()), + &header_size, ¤t_page_header_); +#endif // PARQUET_ENCRYPTION break; } catch (std::exception& e) { // Failed to deserialize. 
Double the allowed page header size and try again @@ -297,11 +321,12 @@ std::shared_ptr SerializedPageReader::NextPage() { int compressed_len = current_page_header_.compressed_page_size; int uncompressed_len = current_page_header_.uncompressed_page_size; +#ifdef PARQUET_ENCRYPTION if (data_decryptor_ != NULLPTR) { UpdateDecryption(data_decryptor_, current_page_is_dictionary, encryption::kDictionaryPage, data_pageAAD_); } - +#endif // Read the compressed data page. std::shared_ptr page_buffer; PARQUET_THROW_NOT_OK(stream_->Read(compressed_len, &page_buffer)); @@ -312,6 +337,7 @@ std::shared_ptr SerializedPageReader::NextPage() { ParquetException::EofException(ss.str()); } +#ifdef PARQUET_ENCRYPTION // Decrypt it if we need to if (data_decryptor_ != nullptr) { PARQUET_THROW_NOT_OK(decryption_buffer_->Resize( @@ -321,7 +347,7 @@ std::shared_ptr SerializedPageReader::NextPage() { page_buffer = decryption_buffer_; } - +#endif // PARQUET_ENCRYPTION // Uncompress it if we need to if (decompressor_ != nullptr) { // Grow the uncompressed buffer if we need to. diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 6a180ca9b7e4..1298b15b62db 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -44,7 +44,10 @@ class RleDecoder; namespace parquet { class Page; + +#ifdef PARQUET_ENCRYPTION class Decryptor; +#endif // 16 MB is the default maximum page header size static constexpr uint32_t kDefaultMaxPageHeaderSize = 16 * 1024 * 1024; @@ -77,8 +80,11 @@ struct PageReaderContext { bool column_has_dictionary; int16_t row_group_ordinal; int16_t column_ordinal; + +#ifdef PARQUET_ENCRYPTION std::shared_ptr meta_decryptor; std::shared_ptr data_decryptor; +#endif }; // Abstract page iterator interface. 
This way, we can feed column pages to the diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 1ae33c624999..91918638dab9 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -38,8 +38,6 @@ #include "parquet/column_page.h" #include "parquet/encoding.h" -#include "parquet/encryption_internal.h" -#include "parquet/internal_file_encryptor.h" #include "parquet/metadata.h" #include "parquet/platform.h" #include "parquet/properties.h" @@ -48,6 +46,11 @@ #include "parquet/thrift.h" #include "parquet/types.h" +#ifdef PARQUET_ENCRYPTION +#include "parquet/encryption_internal.h" +#include "parquet/internal_file_encryptor.h" +#endif + namespace parquet { using arrow::Status; @@ -141,9 +144,13 @@ class SerializedPageWriter : public PageWriter { SerializedPageWriter(const std::shared_ptr& sink, Compression::type codec, ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, int16_t column_chunk_ordinal, - MemoryPool* pool = arrow::default_memory_pool(), + MemoryPool* pool = arrow::default_memory_pool() +#ifdef PARQUET_ENCRYPTION + , std::shared_ptr meta_encryptor = NULLPTR, - std::shared_ptr data_encryptor = NULLPTR) + std::shared_ptr data_encryptor = NULLPTR +#endif + ) : sink_(sink), metadata_(metadata), pool_(pool), @@ -154,12 +161,18 @@ class SerializedPageWriter : public PageWriter { total_compressed_size_(0), page_ordinal_(0), row_group_ordinal_(row_group_ordinal), - column_ordinal_(column_chunk_ordinal), + column_ordinal_(column_chunk_ordinal) +#ifdef PARQUET_ENCRYPTION + , meta_encryptor_(meta_encryptor), - data_encryptor_(data_encryptor) { + data_encryptor_(data_encryptor) +#endif + { +#ifdef PARQUET_ENCRYPTION if (data_encryptor_ != NULLPTR || meta_encryptor_ != NULLPTR) { InitEncryption(); } +#endif compressor_ = GetCodecFromArrow(codec); thrift_serializer_.reset(new ThriftSerializer); } @@ -184,6 +197,7 @@ class SerializedPageWriter : public PageWriter { const uint8_t* output_data_buffer = 
compressed_data->data(); int32_t output_data_len = static_cast(compressed_data->size()); +#ifdef PARQUET_ENCRYPTION std::shared_ptr encrypted_data_buffer = nullptr; if (data_encryptor_.get()) { UpdateEncryption(encryption::kDictionaryPage); @@ -193,6 +207,7 @@ class SerializedPageWriter : public PageWriter { encrypted_data_buffer->mutable_data()); output_data_buffer = encrypted_data_buffer->data(); } +#endif format::PageHeader page_header; page_header.__set_type(format::PageType::DICTIONARY_PAGE); @@ -207,11 +222,16 @@ class SerializedPageWriter : public PageWriter { dictionary_page_offset_ = start_pos; } +#ifdef PARQUET_ENCRYPTION if (meta_encryptor_) { UpdateEncryption(encryption::kDictionaryPageHeader); } int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_); +#else + int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get()); +#endif + PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len)); total_uncompressed_size_ += uncompressed_size + header_size; @@ -223,6 +243,7 @@ class SerializedPageWriter : public PageWriter { } void Close(bool has_dictionary, bool fallback) override { +#ifdef PARQUET_ENCRYPTION if (meta_encryptor_ != nullptr) { UpdateEncryption(encryption::kColumnMetaData); } @@ -231,6 +252,12 @@ class SerializedPageWriter : public PageWriter { metadata_->Finish(num_values_, dictionary_page_offset_, -1, data_page_offset_, total_compressed_size_, total_uncompressed_size_, has_dictionary, fallback, meta_encryptor_); +#else + // index_page_offset = -1 since they are not supported + metadata_->Finish(num_values_, dictionary_page_offset_, -1, data_page_offset_, + total_compressed_size_, total_uncompressed_size_, has_dictionary, + fallback); +#endif // Write metadata at end of column chunk metadata_->WriteTo(sink_.get()); } @@ -271,6 +298,7 @@ class SerializedPageWriter : public PageWriter { const uint8_t* output_data_buffer = compressed_data->data(); int32_t output_data_len = 
static_cast(compressed_data->size()); +#ifdef PARQUET_ENCRYPTION std::shared_ptr encrypted_data_buffer = AllocateBuffer(pool_, 0); if (data_encryptor_.get()) { UpdateEncryption(encryption::kDataPage); @@ -280,6 +308,7 @@ class SerializedPageWriter : public PageWriter { encrypted_data_buffer->mutable_data()); output_data_buffer = encrypted_data_buffer->data(); } +#endif format::PageHeader page_header; page_header.__set_type(format::PageType::DATA_PAGE); @@ -294,11 +323,15 @@ class SerializedPageWriter : public PageWriter { data_page_offset_ = start_pos; } +#ifdef PARQUET_ENCRYPTION if (meta_encryptor_) { UpdateEncryption(encryption::kDataPageHeader); } int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_); +#else + int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get()); +#endif PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len)); total_uncompressed_size_ += uncompressed_size + header_size; @@ -324,6 +357,7 @@ class SerializedPageWriter : public PageWriter { int64_t total_uncompressed_size() { return total_uncompressed_size_; } private: +#ifdef PARQUET_ENCRYPTION void InitEncryption() { // Prepare the AAD for quick update later. if (data_encryptor_ != NULLPTR) { @@ -372,6 +406,7 @@ class SerializedPageWriter : public PageWriter { throw ParquetException("Unknown module type in UpdateEncryption"); } } +#endif std::shared_ptr sink_; ColumnChunkMetaDataBuilder* metadata_; @@ -384,21 +419,25 @@ class SerializedPageWriter : public PageWriter { int16_t page_ordinal_; int16_t row_group_ordinal_; int16_t column_ordinal_; - std::string data_pageAAD_; - std::string data_page_headerAAD_; std::unique_ptr thrift_serializer_; // Compression codec to use. 
std::unique_ptr compressor_; +#ifdef PARQUET_ENCRYPTION + std::string data_pageAAD_; + std::string data_page_headerAAD_; + std::shared_ptr meta_encryptor_; std::shared_ptr data_encryptor_; +#endif }; // This implementation of the PageWriter writes to the final sink on Close . class BufferedPageWriter : public PageWriter { public: +#ifdef PARQUET_ENCRYPTION BufferedPageWriter(const std::shared_ptr& sink, Compression::type codec, ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, @@ -406,11 +445,24 @@ class BufferedPageWriter : public PageWriter { MemoryPool* pool = arrow::default_memory_pool(), std::shared_ptr meta_encryptor = NULLPTR, std::shared_ptr data_encryptor = NULLPTR) +#else + BufferedPageWriter(const std::shared_ptr& sink, + Compression::type codec, ColumnChunkMetaDataBuilder* metadata, + int16_t row_group_ordinal, int16_t current_column_ordinal, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) +#endif : final_sink_(sink), metadata_(metadata) { in_memory_sink_ = CreateOutputStream(pool); + +#ifdef PARQUET_ENCRYPTION pager_ = std::unique_ptr(new SerializedPageWriter( in_memory_sink_, codec, metadata, row_group_ordinal, current_column_ordinal, pool, meta_encryptor, data_encryptor)); +#else + pager_ = std::unique_ptr( + new SerializedPageWriter(in_memory_sink_, codec, metadata, row_group_ordinal, + current_column_ordinal, pool)); +#endif } int64_t WriteDictionaryPage(const DictionaryPage& page) override { @@ -455,9 +507,13 @@ class BufferedPageWriter : public PageWriter { std::unique_ptr PageWriter::Open( const std::shared_ptr& sink, Compression::type codec, ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, - int16_t column_chunk_ordinal, MemoryPool* pool, bool buffered_row_group, - std::shared_ptr meta_encryptor, - std::shared_ptr data_encryptor) { + int16_t column_chunk_ordinal, MemoryPool* pool, bool buffered_row_group +#ifdef PARQUET_ENCRYPTION + , + std::shared_ptr meta_encryptor, std::shared_ptr data_encryptor 
+#endif +) { +#ifdef PARQUET_ENCRYPTION if (buffered_row_group) { return std::unique_ptr(new BufferedPageWriter( sink, codec, metadata, row_group_ordinal, column_chunk_ordinal, pool, @@ -467,6 +523,15 @@ std::unique_ptr PageWriter::Open( sink, codec, metadata, row_group_ordinal, column_chunk_ordinal, pool, meta_encryptor, data_encryptor)); } +#else + if (buffered_row_group) { + return std::unique_ptr(new BufferedPageWriter( + sink, codec, metadata, row_group_ordinal, column_chunk_ordinal, pool)); + } else { + return std::unique_ptr(new SerializedPageWriter( + sink, codec, metadata, row_group_ordinal, column_chunk_ordinal, pool)); + } +#endif } // ---------------------------------------------------------------------- diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index 38cfdfd241da..3c662dbe6683 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -47,7 +47,10 @@ class CompressedDataPage; class DictionaryPage; class ColumnChunkMetaDataBuilder; class WriterProperties; + +#ifdef PARQUET_ENCRYPTION class Encryptor; +#endif class PARQUET_EXPORT LevelEncoder { public: @@ -88,9 +91,13 @@ class PARQUET_EXPORT PageWriter { ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal = -1, int16_t column_chunk_ordinal = -1, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), +#ifdef PARQUET_ENCRYPTION bool buffered_row_group = false, std::shared_ptr header_encryptor = NULLPTR, std::shared_ptr data_encryptor = NULLPTR); +#else + bool buffered_row_group = false); +#endif // The Column Writer decides if dictionary encoding is used if set and // if the dictionary encoding has fallen back to default encoding on reaching dictionary diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index a6e6abbd3d7a..822414713a15 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -32,7 +32,6 @@ #include "parquet/column_reader.h" #include "parquet/column_scanner.h" 
#include "parquet/deprecated_io.h" -#include "parquet/encryption_internal.h" #include "parquet/exception.h" #include "parquet/file_writer.h" #include "parquet/internal_file_decryptor.h" @@ -42,6 +41,10 @@ #include "parquet/schema.h" #include "parquet/types.h" +#ifdef PARQUET_ENCRYPTION +#include "parquet/encryption_internal.h" +#endif + namespace parquet { // PARQUET-978: Minimize footer reads by reading 64 KB from the end of the file @@ -84,12 +87,21 @@ class SerializedRowGroup : public RowGroupReader::Contents { public: SerializedRowGroup(const std::shared_ptr& source, FileMetaData* file_metadata, int row_group_number, - const ReaderProperties& props, InternalFileDecryptor* file_decryptor) + const ReaderProperties& props +#ifdef PARQUET_ENCRYPTION + , + InternalFileDecryptor* file_decryptor +#endif + ) : source_(source), file_metadata_(file_metadata), properties_(props), - row_group_ordinal_((int16_t)row_group_number), - file_decryptor_(file_decryptor) { + row_group_ordinal_((int16_t)row_group_number) +#ifdef PARQUET_ENCRYPTION + , + file_decryptor_(file_decryptor) +#endif + { row_group_metadata_ = file_metadata->RowGroup(row_group_number); } @@ -99,7 +111,12 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::unique_ptr GetColumnPageReader(int i) override { // Read column chunk from the file +#ifdef PARQUET_ENCRYPTION auto col = row_group_metadata_->ColumnChunk(i, row_group_ordinal_, file_decryptor_); +#else + auto col = row_group_metadata_->ColumnChunk(i, row_group_ordinal_); +#endif + int64_t col_start = col->data_page_offset(); if (col->has_dictionary_page() && col->dictionary_page_offset() > 0 && col_start > col->dictionary_page_offset()) { @@ -123,6 +140,8 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::shared_ptr stream = properties_.GetStream(source_, col_start, col_length); + +#ifdef PARQUET_ENCRYPTION std::unique_ptr crypto_metadata = col->crypto_metadata(); // Column is encrypted only if crypto_metadata exists. 
@@ -159,6 +178,10 @@ class SerializedRowGroup : public RowGroupReader::Contents { static_cast(i), meta_decryptor, data_decryptor}; return PageReader::Open(stream, col->num_values(), col->compression(), properties_.memory_pool(), &ctx); +#else + return PageReader::Open(stream, col->num_values(), col->compression(), + properties_.memory_pool()); +#endif } private: @@ -167,7 +190,10 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::unique_ptr row_group_metadata_; ReaderProperties properties_; int16_t row_group_ordinal_; + +#ifdef PARQUET_ENCRYPTION InternalFileDecryptor* file_decryptor_; +#endif }; // ---------------------------------------------------------------------- @@ -190,12 +216,19 @@ class SerializedFile : public ParquetFileReader::Contents { } void Close() override { +#ifdef PARQUET_ENCRYPTION if (file_decryptor_) file_decryptor_->WipeOutDecryptionKeys(); +#endif } std::shared_ptr GetRowGroup(int i) override { - std::unique_ptr contents(new SerializedRowGroup( - source_, file_metadata_.get(), i, properties_, file_decryptor_.get())); + std::unique_ptr contents( + new SerializedRowGroup(source_, file_metadata_.get(), i, +#ifdef PARQUET_ENCRYPTION + properties_, file_decryptor_.get())); +#else + properties_)); +#endif return std::make_shared(std::move(contents)); } @@ -224,39 +257,32 @@ class SerializedFile : public ParquetFileReader::Contents { source_->ReadAt(file_size - footer_read_size, footer_read_size, &footer_buffer)); // Check if all bytes are read. Check if last 4 bytes read have the magic bits +#ifdef PARQUET_ENCRYPTION + if (footer_buffer->size() != footer_read_size || + (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) != 0 && + memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) != 0)) { + throw ParquetException("Invalid parquet file. 
Corrupt footer."); + } +#else if (footer_buffer->size() != footer_read_size || (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) != 0 && memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) != 0)) { throw ParquetException("Invalid parquet file. Corrupt footer."); + } else if (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) == + 0) { + throw ParquetException( + "Encrypted parquet file. " + "Should build with parquet encryption support."); } +#endif +#if PARQUET_ENCRYPTION // No encryption or encryption with plaintext footer mode. if (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) == 0) { - uint32_t metadata_len = arrow::util::SafeLoadAs( - reinterpret_cast(footer_buffer->data()) + footer_read_size - - kFooterSize); - int64_t metadata_start = file_size - kFooterSize - metadata_len; - if (kFooterSize + metadata_len > file_size) { - throw ParquetException( - "Invalid parquet file. File is less than " - "file metadata size."); - } - std::shared_ptr metadata_buffer; - // Check if the footer_buffer contains the entire metadata - if (footer_read_size >= (metadata_len + kFooterSize)) { - metadata_buffer = SliceBuffer( - footer_buffer, footer_read_size - metadata_len - kFooterSize, metadata_len); - } else { - PARQUET_THROW_NOT_OK( - source_->ReadAt(metadata_start, metadata_len, &metadata_buffer)); - if (metadata_buffer->size() != metadata_len) { - throw ParquetException("Invalid parquet file. Could not read metadata bytes."); - } - } - - uint32_t read_metadata_len = metadata_len; - file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &read_metadata_len); + uint32_t metadata_len, read_metadata_len; + ParseUnencryptedFileMetadata(footer_buffer, footer_read_size, file_size, + &metadata_buffer, &metadata_len, &read_metadata_len); auto file_decryption_properties = properties_.file_decryption_properties(); if (!file_metadata_->is_encryption_algorithm_set()) { // Non encrypted file. 
@@ -275,14 +301,29 @@ class SerializedFile : public ParquetFileReader::Contents { ParseMetaDataOfEncryptedFileWithEncryptedFooter(footer_buffer, footer_read_size, file_size); } +#else // not defined PARQUET_ENCRYPTION + std::shared_ptr metadata_buffer; + uint32_t metadata_len, read_metadata_len; + ParseUnencryptedFileMetadata(footer_buffer, footer_read_size, file_size, + &metadata_buffer, &metadata_len, &read_metadata_len); +#endif } private: std::shared_ptr source_; std::shared_ptr file_metadata_; ReaderProperties properties_; + +#ifdef PARQUET_ENCRYPTION std::unique_ptr file_decryptor_; +#endif + + void ParseUnencryptedFileMetadata(const std::shared_ptr& footer_buffer, + int64_t footer_read_size, int64_t file_size, + std::shared_ptr* metadata_buffer, + uint32_t* metadata_len, uint32_t* read_metadata_len); +#if PARQUET_ENCRYPTION std::string HandleAadPrefix(FileDecryptionProperties* file_decryption_properties, EncryptionAlgorithm& algo); @@ -294,8 +335,39 @@ class SerializedFile : public ParquetFileReader::Contents { void ParseMetaDataOfEncryptedFileWithEncryptedFooter( const std::shared_ptr& footer_buffer, int64_t footer_read_size, int64_t file_size); +#endif }; +void SerializedFile::ParseUnencryptedFileMetadata( + const std::shared_ptr& footer_buffer, int64_t footer_read_size, + int64_t file_size, std::shared_ptr* metadata_buffer, uint32_t* metadata_len, + uint32_t* read_metadata_len) { + *metadata_len = arrow::util::SafeLoadAs( + reinterpret_cast(footer_buffer->data()) + footer_read_size - + kFooterSize); + int64_t metadata_start = file_size - kFooterSize - *metadata_len; + if (kFooterSize + *metadata_len > file_size) { + throw ParquetException( + "Invalid parquet file. 
File is less than " + "file metadata size."); + } + + // Check if the footer_buffer contains the entire metadata + if (footer_read_size >= (*metadata_len + kFooterSize)) { + *metadata_buffer = SliceBuffer( + footer_buffer, footer_read_size - *metadata_len - kFooterSize, *metadata_len); + } else { + PARQUET_THROW_NOT_OK(source_->ReadAt(metadata_start, *metadata_len, metadata_buffer)); + if ((*metadata_buffer)->size() != *metadata_len) { + throw ParquetException("Invalid parquet file. Could not read metadata bytes."); + } + } + + *read_metadata_len = *metadata_len; + file_metadata_ = FileMetaData::Make((*metadata_buffer)->data(), read_metadata_len); +} + +#ifdef PARQUET_ENCRYPTION void SerializedFile::ParseMetaDataOfEncryptedFileWithEncryptedFooter( const std::shared_ptr& footer_buffer, int64_t footer_read_size, int64_t file_size) { @@ -373,8 +445,8 @@ void SerializedFile::ParseMetaDataOfEncryptedFileWithPlaintextFooter( } auto encryptor = file_decryptor_->GetFooterSigningEncryptor(); - if (!file_metadata_->verify_signature( - encryptor, metadata_buffer->data() + read_metadata_len)) { + if (!file_metadata_->VerifySignature(encryptor, + metadata_buffer->data() + read_metadata_len)) { throw ParquetException( "Invalid parquet file. 
Could not verify plaintext " "footer metadata"); @@ -424,6 +496,7 @@ std::string SerializedFile::HandleAadPrefix( } return aad_prefix + algo.aad.aad_file_unique; } +#endif // PARQUET_ENCRYPTION // ---------------------------------------------------------------------- // ParquetFileReader public API @@ -510,6 +583,7 @@ std::shared_ptr ParquetFileReader::RowGroup(int i) { DCHECK(i < metadata()->num_row_groups()) << "The file only has " << metadata()->num_row_groups() << "row groups, requested reader for: " << i; + return contents_->GetRowGroup(i); } diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index bfcf636301b5..72abbf4407b4 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -24,12 +24,15 @@ #include "parquet/column_writer.h" #include "parquet/deprecated_io.h" -#include "parquet/encryption_internal.h" #include "parquet/exception.h" -#include "parquet/internal_file_encryptor.h" #include "parquet/platform.h" #include "parquet/schema.h" +#ifdef PARQUET_ENCRYPTION +#include "parquet/encryption_internal.h" +#include "parquet/internal_file_encryptor.h" +#endif + using arrow::MemoryPool; using parquet::schema::GroupNode; @@ -80,8 +83,12 @@ class RowGroupSerializer : public RowGroupWriter::Contents { public: RowGroupSerializer(const std::shared_ptr& sink, RowGroupMetaDataBuilder* metadata, int16_t row_group_ordinal, - const WriterProperties* properties, bool buffered_row_group = false, - InternalFileEncryptor* file_encryptor = NULLPTR) + const WriterProperties* properties, bool buffered_row_group = false +#ifdef PARQUET_ENCRYPTION + , + InternalFileEncryptor* file_encryptor = NULLPTR +#endif + ) : sink_(sink), metadata_(metadata), properties_(properties), @@ -90,8 +97,12 @@ class RowGroupSerializer : public RowGroupWriter::Contents { row_group_ordinal_ (row_group_ordinal), next_column_index_(0), num_rows_(0), - buffered_row_group_(buffered_row_group), - file_encryptor_(file_encryptor) { + 
buffered_row_group_(buffered_row_group) +#ifdef PARQUET_ENCRYPTION + , + file_encryptor_(file_encryptor) +#endif + { if (buffered_row_group) { InitColumns(); } else { @@ -127,6 +138,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents { ++next_column_index_; const ColumnDescriptor* column_descr = col_meta->descr(); +#ifdef PARQUET_ENCRYPTION auto meta_encryptor = file_encryptor_ ? file_encryptor_->GetColumnMetaEncryptor(column_descr->path()) : NULLPTR; @@ -138,6 +150,12 @@ class RowGroupSerializer : public RowGroupWriter::Contents { sink_, properties_->compression(column_descr->path()), col_meta, row_group_ordinal_, static_cast(current_column_index_ - 1), properties_->memory_pool(), false, meta_encryptor, data_encryptor); +#else + std::unique_ptr pager = PageWriter::Open( + sink_, properties_->compression(column_descr->path()), col_meta, + row_group_ordinal_, static_cast(current_column_index_ - 1), + properties_->memory_pool(), false); +#endif column_writers_[0] = ColumnWriter::Make(col_meta, std::move(pager), properties_); return column_writers_[0].get(); } @@ -206,7 +224,10 @@ class RowGroupSerializer : public RowGroupWriter::Contents { int next_column_index_; mutable int64_t num_rows_; bool buffered_row_group_; + +#ifdef PARQUET_ENCRYPTION InternalFileEncryptor* file_encryptor_; +#endif void CheckRowsWritten() const { // verify when only one column is written at a time @@ -234,6 +255,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents { for (int i = 0; i < num_columns(); i++) { auto col_meta = metadata_->NextColumnChunk(); const ColumnDescriptor* column_descr = col_meta->descr(); +#ifdef PARQUET_ENCRYPTION auto meta_encryptor = file_encryptor_ ? 
file_encryptor_->GetColumnMetaEncryptor(column_descr->path()) : NULLPTR; @@ -245,6 +267,13 @@ class RowGroupSerializer : public RowGroupWriter::Contents { static_cast(row_group_ordinal_), static_cast(current_column_index_), properties_->memory_pool(), buffered_row_group_, meta_encryptor, data_encryptor); +#else + std::unique_ptr pager = + PageWriter::Open(sink_, properties_->compression(column_descr->path()), + col_meta, static_cast(row_group_ordinal_), + static_cast(current_column_index_), + properties_->memory_pool(), buffered_row_group_); +#endif column_writers_.push_back( ColumnWriter::Make(col_meta, std::move(pager), properties_)); } @@ -283,6 +312,7 @@ class FileSerializer : public ParquetFileWriter::Contents { } row_group_writer_.reset(); +#ifdef PARQUET_ENCRYPTION // Write magic bytes and metadata auto file_encryption_properties = properties_->file_encryption_properties(); @@ -292,6 +322,10 @@ class FileSerializer : public ParquetFileWriter::Contents { } else { // Encrypted file CloseEncryptedFile(file_encryption_properties); } +#else + file_metadata_ = metadata_->Finish(); + WriteFileMetaData(*file_metadata_, sink_.get()); +#endif } } @@ -313,8 +347,11 @@ class FileSerializer : public ParquetFileWriter::Contents { auto rg_metadata = metadata_->AppendRowGroup(); std::unique_ptr contents(new RowGroupSerializer( sink_, rg_metadata, static_cast(num_row_groups_ - 1), properties_.get(), +#ifdef PARQUET_ENCRYPTION buffered_row_group, file_encryptor_.get())); - +#else + buffered_row_group)); +#endif row_group_writer_.reset(new RowGroupWriter(std::move(contents))); return row_group_writer_.get(); } @@ -345,6 +382,7 @@ class FileSerializer : public ParquetFileWriter::Contents { StartFile(); } +#ifdef PARQUET_ENCRYPTION void CloseEncryptedFile(FileEncryptionProperties* file_encryption_properties) { // Encrypted file with encrypted footer if (file_encryption_properties->encrypted_footer()) { @@ -374,6 +412,7 @@ class FileSerializer : public ParquetFileWriter::Contents 
{ file_encryptor_->WipeOutEncryptionKeys(); } } +#endif std::shared_ptr sink_; bool is_open_; @@ -384,9 +423,12 @@ class FileSerializer : public ParquetFileWriter::Contents { // Only one of the row group writers is active at a time std::unique_ptr row_group_writer_; +#ifdef PARQUET_ENCRYPTION std::unique_ptr file_encryptor_; +#endif void StartFile() { +#ifdef PARQUET_ENCRYPTION auto file_encryption_properties = properties_->file_encryption_properties(); if (file_encryption_properties == nullptr) { // Unencrypted parquet files always start with PAR1 @@ -400,6 +442,10 @@ class FileSerializer : public ParquetFileWriter::Contents { PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4)); } } +#else + // Unencrypted parquet files always start with PAR1 + PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4)); +#endif } }; @@ -450,6 +496,12 @@ void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sin PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4)); } +void WriteMetaDataFile(const FileMetaData& file_metadata, ArrowOutputStream* sink) { + PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4)); + return WriteFileMetaData(file_metadata, sink); +} + +#ifdef PARQUET_ENCRYPTION void WriteEncryptedFileMetadata(const FileMetaData& file_metadata, ArrowOutputStream* sink, const std::shared_ptr& encryptor, @@ -493,11 +545,7 @@ void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, ParquetOutputWrapper wrapper(sink); crypto_metadata.WriteTo(&wrapper); } - -void WriteMetaDataFile(const FileMetaData& file_metadata, ArrowOutputStream* sink) { - PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4)); - return WriteFileMetaData(file_metadata, sink); -} +#endif // PARQUET_ENCRYPTION const SchemaDescriptor* ParquetFileWriter::schema() const { return contents_->schema(); } diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h index 51e9da7757b3..1c97ed5c1902 100644 --- a/cpp/src/parquet/file_writer.h +++ b/cpp/src/parquet/file_writer.h @@ 
-99,6 +99,16 @@ class PARQUET_EXPORT RowGroupWriter { ARROW_DEPRECATED("Use version with arrow::io::OutputStream*") PARQUET_EXPORT void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink); + +PARQUET_EXPORT +void WriteFileMetaData(const FileMetaData& file_metadata, + ::arrow::io::OutputStream* sink); + +PARQUET_EXPORT +void WriteMetaDataFile(const FileMetaData& file_metadata, + ::arrow::io::OutputStream* sink); + +#ifdef PARQUET_ENCRYPTION PARQUET_EXPORT void WriteEncryptedFileMetadata(const FileMetaData& file_metadata, ArrowOutputStream* sink, @@ -107,10 +117,6 @@ void WriteEncryptedFileMetadata(const FileMetaData& file_metadata, void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, OutputStream* sink); - -PARQUET_EXPORT -void WriteFileMetaData(const FileMetaData& file_metadata, - ::arrow::io::OutputStream* sink); PARQUET_EXPORT void WriteEncryptedFileMetadata(const FileMetaData& file_metadata, ::arrow::io::OutputStream* sink, @@ -119,10 +125,7 @@ void WriteEncryptedFileMetadata(const FileMetaData& file_metadata, PARQUET_EXPORT void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, ::arrow::io::OutputStream* sink); - -PARQUET_EXPORT -void WriteMetaDataFile(const FileMetaData& file_metadata, - ::arrow::io::OutputStream* sink); +#endif class PARQUET_EXPORT ParquetFileWriter { public: diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 9ac7cf9c5edf..664506e17018 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -26,7 +26,6 @@ #include #include // IWYU pragma: keep #include "parquet/exception.h" -#include "parquet/internal_file_decryptor.h" #include "parquet/metadata.h" #include "parquet/schema.h" #include "parquet/schema_internal.h" @@ -46,6 +45,10 @@ using ::std::regex_match; using ::std::smatch; #endif +#ifdef PARQUET_ENCRYPTION +#include "parquet/internal_file_decryptor.h" +#endif + namespace parquet { const ApplicationVersion& 
ApplicationVersion::PARQUET_251_FIXED_VERSION() { @@ -125,6 +128,8 @@ std::shared_ptr MakeColumnStats(const format::ColumnMetaData& meta_d } // MetaData Accessor + +#ifdef PARQUET_ENCRYPTION // ColumnCryptoMetaData class ColumnCryptoMetaData::ColumnCryptoMetaDataImpl { public: @@ -170,6 +175,7 @@ bool ColumnCryptoMetaData::encrypted_with_footer_key() const { const std::string& ColumnCryptoMetaData::key_metadata() const { return impl_->key_metadata(); } +#endif // PARQUET_ENCRYPTION // ColumnChunk metadata class ColumnChunkMetaData::ColumnChunkMetaDataImpl { @@ -177,9 +183,14 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { explicit ColumnChunkMetaDataImpl(const format::ColumnChunk* column, const ColumnDescriptor* descr, int16_t row_group_ordinal, int16_t column_ordinal, - const ApplicationVersion* writer_version, - InternalFileDecryptor* file_decryptor = NULLPTR) + const ApplicationVersion* writer_version +#ifdef PARQUET_ENCRYPTION + , + InternalFileDecryptor* file_decryptor = NULLPTR +#endif + ) : column_(column), descr_(descr), writer_version_(writer_version) { +#ifdef PARQUET_ENCRYPTION if (column->__isset.crypto_metadata) { // column metadata is encrypted format::ColumnCryptoMetaData ccmd = column->crypto_metadata; @@ -208,6 +219,10 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { } else { // column metadata is not encrypted is_metadata_set_ = true; } +#else + is_metadata_set_ = true; +#endif // PARQUET_ENCRYPTION + if (is_metadata_set_) { const format::ColumnMetaData& meta_data = GetMetadataIfSet(); for (auto encoding : meta_data.encodings) { @@ -288,6 +303,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { return GetMetadataIfSet().total_uncompressed_size; } +#ifdef PARQUET_ENCRYPTION inline std::unique_ptr crypto_metadata() const { if (column_->__isset.crypto_metadata) { return ColumnCryptoMetaData::Make( @@ -296,6 +312,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { return nullptr; } } +#endif private: mutable std::shared_ptr 
possible_stats_; @@ -307,6 +324,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { bool is_metadata_set_; inline const format::ColumnMetaData& GetMetadataIfSet() const { +#ifdef PARQUET_ENCRYPTION if (column_->__isset.crypto_metadata && column_->crypto_metadata.__isset.ENCRYPTION_WITH_COLUMN_KEY) { if (!is_metadata_set_) { @@ -319,27 +337,51 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { } else { return column_->meta_data; } +#else + return column_->meta_data; +#endif } }; std::unique_ptr ColumnChunkMetaData::Make( const void* metadata, const ColumnDescriptor* descr, - const ApplicationVersion* writer_version, InternalFileDecryptor* file_decryptor, - int16_t row_group_ordinal, int16_t column_ordinal) { + const ApplicationVersion* writer_version, int16_t row_group_ordinal, + int16_t column_ordinal +#ifdef PARQUET_ENCRYPTION + , + InternalFileDecryptor* file_decryptor +#endif +) { return std::unique_ptr( new ColumnChunkMetaData(metadata, descr, row_group_ordinal, column_ordinal, +#ifdef PARQUET_ENCRYPTION writer_version, file_decryptor)); +#else + writer_version)); +#endif } ColumnChunkMetaData::ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal, int16_t column_ordinal, - const ApplicationVersion* writer_version, - InternalFileDecryptor* file_decryptor) + const ApplicationVersion* writer_version +#ifdef PARQUET_ENCRYPTION + , + InternalFileDecryptor* file_decryptor +#endif + ) : impl_{std::unique_ptr(new ColumnChunkMetaDataImpl( reinterpret_cast(metadata), descr, - row_group_ordinal, column_ordinal, writer_version, file_decryptor))} {} + row_group_ordinal, column_ordinal, +#ifdef PARQUET_ENCRYPTION + writer_version, file_decryptor))} { +} +#else + writer_version))} { +} +#endif + ColumnChunkMetaData::~ColumnChunkMetaData() {} // column chunk int64_t ColumnChunkMetaData::file_offset() const { return impl_->file_offset(); } @@ -397,9 +439,11 @@ int64_t ColumnChunkMetaData::total_compressed_size() const { 
return impl_->total_compressed_size(); } +#ifdef PARQUET_ENCRYPTION std::unique_ptr ColumnChunkMetaData::crypto_metadata() const { return impl_->crypto_metadata(); } +#endif // row-group metadata class RowGroupMetaData::RowGroupMetaDataImpl { @@ -424,7 +468,12 @@ class RowGroupMetaData::RowGroupMetaDataImpl { inline const SchemaDescriptor* schema() const { return schema_; } std::unique_ptr ColumnChunk( - int i, int16_t row_group_ordinal, InternalFileDecryptor* file_decryptor = NULLPTR) { + int i, +#ifdef PARQUET_ENCRYPTION + int16_t row_group_ordinal, InternalFileDecryptor* file_decryptor = NULLPTR) { +#else + int16_t row_group_ordinal) { +#endif if (!(i < num_columns())) { std::stringstream ss; ss << "The file only has " << num_columns() @@ -432,8 +481,12 @@ class RowGroupMetaData::RowGroupMetaDataImpl { throw ParquetException(ss.str()); } return ColumnChunkMetaData::Make(&row_group_->columns[i], schema_->Column(i), - writer_version_, file_decryptor, row_group_ordinal, + writer_version_, row_group_ordinal, +#ifdef PARQUET_ENCRYPTION + (int16_t)i, file_decryptor); +#else (int16_t)i); +#endif } private: @@ -464,22 +517,37 @@ int64_t RowGroupMetaData::total_byte_size() const { return impl_->total_byte_siz const SchemaDescriptor* RowGroupMetaData::schema() const { return impl_->schema(); } +#ifdef PARQUET_ENCRYPTION std::unique_ptr RowGroupMetaData::ColumnChunk( int i, int16_t row_group_ordinal, InternalFileDecryptor* file_decryptor) const { return impl_->ColumnChunk(i, row_group_ordinal, file_decryptor); } +#else +std::unique_ptr RowGroupMetaData::ColumnChunk( + int i, int16_t row_group_ordinal) const { + return impl_->ColumnChunk(i, row_group_ordinal); +} +#endif // file metadata class FileMetaData::FileMetaDataImpl { public: FileMetaDataImpl() : metadata_len_(0) {} +#ifdef PARQUET_ENCRYPTION explicit FileMetaDataImpl(const void* metadata, uint32_t* metadata_len, const std::shared_ptr& decryptor = nullptr) +#else + explicit FileMetaDataImpl(const void* metadata, 
uint32_t* metadata_len) +#endif : metadata_len_(0) { metadata_.reset(new format::FileMetaData); DeserializeThriftMsg(reinterpret_cast(metadata), metadata_len, +#ifdef PARQUET_ENCRYPTION metadata_.get(), decryptor); +#else + metadata_.get()); +#endif metadata_len_ = *metadata_len; if (metadata_->__isset.created_by) { @@ -493,8 +561,9 @@ class FileMetaData::FileMetaDataImpl { InitKeyValueMetadata(); } - bool verify_signature(std::shared_ptr encryptor, - const void* tail) { +#ifdef PARQUET_ENCRYPTION + bool VerifySignature(std::shared_ptr encryptor, + const void* signature) { // serialize the footer uint8_t* serialized_data; uint32_t serialized_len = metadata_len_; @@ -502,8 +571,8 @@ class FileMetaData::FileMetaDataImpl { serializer.SerializeToBuffer(metadata_.get(), &serialized_len, &serialized_data); // encrypt with nonce - uint8_t* nonce = const_cast(reinterpret_cast(tail)); - uint8_t* tag = const_cast(reinterpret_cast(tail)) + + uint8_t* nonce = const_cast(reinterpret_cast(signature)); + uint8_t* tag = const_cast(reinterpret_cast(signature)) + encryption::kNonceLength; std::vector encrypted_buffer(encryptor->CiphertextSizeDelta() + @@ -514,6 +583,7 @@ class FileMetaData::FileMetaDataImpl { memcmp(encrypted_buffer.data() + encrypted_len - encryption::kGcmTagLength, tag, encryption::kGcmTagLength); } +#endif inline uint32_t size() const { return metadata_len_; } inline int num_columns() const { return schema_.num_columns(); } @@ -526,6 +596,8 @@ class FileMetaData::FileMetaDataImpl { inline int num_schema_elements() const { return static_cast(metadata_->schema.size()); } + +#ifdef PARQUET_ENCRYPTION inline bool is_encryption_algorithm_set() const { return metadata_->__isset.encryption_algorithm; } @@ -535,9 +607,11 @@ class FileMetaData::FileMetaDataImpl { inline const std::string& footer_signing_key_metadata() { return metadata_->footer_signing_key_metadata; } +#endif const ApplicationVersion& writer_version() const { return writer_version_; } +#ifdef 
PARQUET_ENCRYPTION void WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryptor) const { ThriftSerializer serializer; @@ -567,6 +641,12 @@ class FileMetaData::FileMetaDataImpl { serializer.Serialize(metadata_.get(), dst, encryptor); } } +#else + void WriteTo(::arrow::io::OutputStream* dst) const { + ThriftSerializer serializer; + serializer.Serialize(metadata_.get(), dst); + } +#endif // PARQUET_ENCRYPTION std::unique_ptr RowGroup(int i) { if (!(i < num_row_groups())) { @@ -649,6 +729,7 @@ class FileMetaData::FileMetaDataImpl { std::shared_ptr key_value_metadata_; }; +#ifdef PARQUET_ENCRYPTION std::shared_ptr FileMetaData::Make( const void* metadata, uint32_t* metadata_len, const std::shared_ptr& decryptor) { @@ -656,11 +737,24 @@ std::shared_ptr FileMetaData::Make( return std::shared_ptr( new FileMetaData(metadata, metadata_len, decryptor)); } +#else +std::shared_ptr FileMetaData::Make(const void* metadata, + uint32_t* metadata_len) { + // This FileMetaData ctor is private, not compatible with std::make_shared + return std::shared_ptr(new FileMetaData(metadata, metadata_len)); +} +#endif +#ifdef PARQUET_ENCRYPTION FileMetaData::FileMetaData(const void* metadata, uint32_t* metadata_len, const std::shared_ptr& decryptor) : impl_{std::unique_ptr( new FileMetaDataImpl(metadata, metadata_len, decryptor))} {} +#else +FileMetaData::FileMetaData(const void* metadata, uint32_t* metadata_len) + : impl_{std::unique_ptr( + new FileMetaDataImpl(metadata, metadata_len))} {} +#endif FileMetaData::FileMetaData() : impl_{std::unique_ptr(new FileMetaDataImpl())} {} @@ -671,10 +765,12 @@ std::unique_ptr FileMetaData::RowGroup(int i) const { return impl_->RowGroup(i); } -bool FileMetaData::verify_signature(std::shared_ptr encryptor, - const void* tail) { - return impl_->verify_signature(encryptor, tail); +#ifdef PARQUET_ENCRYPTION +bool FileMetaData::VerifySignature(std::shared_ptr encryptor, + const void* signature) { + return impl_->VerifySignature(encryptor, 
signature); } +#endif uint32_t FileMetaData::size() const { return impl_->size(); } @@ -684,6 +780,7 @@ int64_t FileMetaData::num_rows() const { return impl_->num_rows(); } int FileMetaData::num_row_groups() const { return impl_->num_row_groups(); } +#ifdef PARQUET_ENCRYPTION bool FileMetaData::is_encryption_algorithm_set() const { return impl_->is_encryption_algorithm_set(); } @@ -695,6 +792,7 @@ EncryptionAlgorithm FileMetaData::encryption_algorithm() const { const std::string& FileMetaData::footer_signing_key_metadata() const { return impl_->footer_signing_key_metadata(); } +#endif // PARQUET_ENCRYPTION ParquetVersion::type FileMetaData::version() const { switch (impl_->version()) { @@ -729,11 +827,18 @@ void FileMetaData::AppendRowGroups(const FileMetaData& other) { impl_->AppendRowGroups(other.impl_); } +#ifdef PARQUET_ENCRYPTION void FileMetaData::WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryptor) const { return impl_->WriteTo(dst, encryptor); } +#else +void FileMetaData::WriteTo(::arrow::io::OutputStream* dst) const { + return impl_->WriteTo(dst); +} +#endif +#ifdef PARQUET_ENCRYPTION class FileCryptoMetaData::FileCryptoMetaDataImpl { public: FileCryptoMetaDataImpl() {} @@ -786,6 +891,7 @@ FileCryptoMetaData::~FileCryptoMetaData() {} void FileCryptoMetaData::WriteTo(::arrow::io::OutputStream* dst) const { impl_->WriteTo(dst); } +#endif // PARQUET_ENCRYPTION ApplicationVersion::ApplicationVersion(const std::string& application, int major, int minor, int patch) @@ -923,7 +1029,12 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { void Finish(int64_t num_values, int64_t dictionary_page_offset, int64_t index_page_offset, int64_t data_page_offset, int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, - bool dictionary_fallback, const std::shared_ptr& encryptor) { + bool dictionary_fallback +#ifdef PARQUET_ENCRYPTION + , + const std::shared_ptr& encryptor +#endif + ) { if (dictionary_page_offset > 0) { 
column_chunk_->meta_data.__set_dictionary_page_offset(dictionary_page_offset); column_chunk_->__set_file_offset(dictionary_page_offset + compressed_size); @@ -958,6 +1069,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } column_chunk_->meta_data.__set_encodings(thrift_encodings); +#ifdef PARQUET_ENCRYPTION const auto& encrypt_md = properties_->column_encryption_properties(column_->path()); // column is encrypted if (encrypt_md != NULLPTR && encrypt_md->is_encrypted()) { @@ -1011,6 +1123,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } } } +#endif // PARQUET_ENCRYPTION } void WriteTo(::arrow::io::OutputStream* sink) { @@ -1077,11 +1190,19 @@ void ColumnChunkMetaDataBuilder::Finish(int64_t num_values, int64_t index_page_offset, int64_t data_page_offset, int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, +#ifdef PARQUET_ENCRYPTION bool dictionary_fallback, const std::shared_ptr& encryptor) { +#else + bool dictionary_fallback) { +#endif impl_->Finish(num_values, dictionary_page_offset, index_page_offset, data_page_offset, - compressed_size, uncompressed_size, has_dictionary, dictionary_fallback, - encryptor); + compressed_size, uncompressed_size, has_dictionary, +#ifdef PARQUET_ENCRYPTION + dictionary_fallback, encryptor); +#else + dictionary_fallback); +#endif } void ColumnChunkMetaDataBuilder::WriteTo(::arrow::io::OutputStream* sink) { @@ -1229,10 +1350,12 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { const std::shared_ptr& key_value_metadata) : properties_(props), schema_(schema), key_value_metadata_(key_value_metadata) { metadata_.reset(new format::FileMetaData()); +#ifdef PARQUET_ENCRYPTION if (props->file_encryption_properties() != nullptr && props->file_encryption_properties()->encrypted_footer()) { crypto_metadata_.reset(new format::FileCryptoMetaData()); } +#endif } RowGroupMetaDataBuilder* AppendRowGroup() { @@ -1288,6 +1411,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl 
{ metadata_->column_orders.resize(schema_->num_columns(), column_order); metadata_->__isset.column_orders = true; +#ifdef PARQUET_ENCRYPTION // if plaintext footer, set footer signing algorithm auto file_encryption_properties = properties_->file_encryption_properties(); if (file_encryption_properties && !file_encryption_properties->encrypted_footer()) { @@ -1306,6 +1430,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { metadata_->__set_footer_signing_key_metadata(footer_signing_key_metadata); } } +#endif // PARQUET_ENCRYPTION parquet::schema::SchemaFlattener flattener( static_cast(schema_->schema_root().get()), @@ -1317,6 +1442,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { return file_meta_data; } +#ifdef PARQUET_ENCRYPTION std::unique_ptr BuildFileCryptoMetaData() { if (crypto_metadata_ == nullptr) { return nullptr; @@ -1338,10 +1464,13 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { return file_crypto_metadata; } +#endif protected: std::unique_ptr metadata_; +#ifdef PARQUET_ENCRYPTION std::unique_ptr crypto_metadata_; +#endif private: const std::shared_ptr properties_; @@ -1373,8 +1502,10 @@ RowGroupMetaDataBuilder* FileMetaDataBuilder::AppendRowGroup() { std::unique_ptr FileMetaDataBuilder::Finish() { return impl_->Finish(); } +#ifdef PARQUET_ENCRYPTION std::unique_ptr FileMetaDataBuilder::GetCryptoMetaData() { return impl_->BuildFileCryptoMetaData(); } +#endif } // namespace parquet diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 89ea24527c54..186a27ae9c36 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -37,11 +37,14 @@ class ColumnDescriptor; class EncodedStatistics; class Statistics; class SchemaDescriptor; + +#ifdef PARQUET_ENCRYPTION class FileCryptoMetaData; class InternalFileDecryptor; class Decryptor; class Encryptor; class FooterSigningEncryptor; +#endif namespace schema { @@ -104,6 +107,7 @@ class PARQUET_EXPORT ApplicationVersion { SortOrder::type sort_order = 
SortOrder::SIGNED) const; }; +#ifdef PARQUET_ENCRYPTION class PARQUET_EXPORT ColumnCryptoMetaData { public: static std::unique_ptr Make(const uint8_t* metadata); @@ -119,15 +123,22 @@ class PARQUET_EXPORT ColumnCryptoMetaData { class ColumnCryptoMetaDataImpl; std::unique_ptr impl_; }; +#endif class PARQUET_EXPORT ColumnChunkMetaData { public: // API convenience to get a MetaData accessor +#ifdef PARQUET_ENCRYPTION + static std::unique_ptr Make( + const void* metadata, const ColumnDescriptor* descr, + const ApplicationVersion* writer_version = NULLPTR, int16_t row_group_ordinal = -1, + int16_t column_ordinal = -1, InternalFileDecryptor* file_decryptor = NULLPTR); +#else static std::unique_ptr Make( const void* metadata, const ColumnDescriptor* descr, - const ApplicationVersion* writer_version = NULLPTR, - InternalFileDecryptor* file_decryptor = NULLPTR, int16_t row_group_ordinal = -1, + const ApplicationVersion* writer_version = NULLPTR, int16_t row_group_ordinal = -1, int16_t column_ordinal = -1); +#endif ~ColumnChunkMetaData(); @@ -153,13 +164,21 @@ class PARQUET_EXPORT ColumnChunkMetaData { int64_t index_page_offset() const; int64_t total_compressed_size() const; int64_t total_uncompressed_size() const; +#ifdef PARQUET_ENCRYPTION std::unique_ptr crypto_metadata() const; +#endif private: +#ifdef PARQUET_ENCRYPTION explicit ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal, int16_t column_ordinal, const ApplicationVersion* writer_version = NULLPTR, InternalFileDecryptor* file_decryptor = NULLPTR); +#else + explicit ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, + int16_t row_group_ordinal, int16_t column_ordinal, + const ApplicationVersion* writer_version = NULLPTR); +#endif // PIMPL Idiom class ColumnChunkMetaDataImpl; std::unique_ptr impl_; @@ -180,9 +199,15 @@ class PARQUET_EXPORT RowGroupMetaData { int64_t total_byte_size() const; // Return const-pointer to make it clear that this object 
is not to be copied const SchemaDescriptor* schema() const; + +#ifdef PARQUET_ENCRYPTION std::unique_ptr ColumnChunk( int i, int16_t row_group_ordinal = -1, InternalFileDecryptor* file_decryptor = NULLPTR) const; +#else + std::unique_ptr ColumnChunk(int i, + int16_t row_group_ordinal = -1) const; +#endif private: explicit RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema, @@ -197,14 +222,25 @@ class FileMetaDataBuilder; class PARQUET_EXPORT FileMetaData { public: // API convenience to get a MetaData accessor + +#ifdef PARQUET_ENCRYPTION static std::shared_ptr Make( const void* serialized_metadata, uint32_t* metadata_len, const std::shared_ptr& decryptor = NULLPTR); +#else + static std::shared_ptr Make(const void* serialized_metadata, + uint32_t* metadata_len); +#endif ~FileMetaData(); +#ifdef PARQUET_ENCRYPTION + /// Verify signature of FileMetadata when file is encrypted but footer is not encrypted + /// (plaintext footer). + /// Signature is 28 bytes (12 byte nonce and 16 byte tags) when encrypting FileMetadata + bool VerifySignature(std::shared_ptr encryptor, + const void* signature); +#endif - bool verify_signature(std::shared_ptr encryptor, - const void* tail); // file metadata uint32_t size() const; @@ -213,18 +249,24 @@ class PARQUET_EXPORT FileMetaData { int64_t num_rows() const; int num_row_groups() const; - bool is_encryption_algorithm_set() const; - EncryptionAlgorithm encryption_algorithm() const; - const std::string& footer_signing_key_metadata() const; ParquetVersion::type version() const; const std::string& created_by() const; int num_schema_elements() const; std::unique_ptr RowGroup(int i) const; - const ApplicationVersion& writer_version() const; +#ifdef PARQUET_ENCRYPTION + bool is_encryption_algorithm_set() const; + EncryptionAlgorithm encryption_algorithm() const; + const std::string& footer_signing_key_metadata() const; +#endif + +#ifdef PARQUET_ENCRYPTION void WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& 
encryptor = NULLPTR) const; +#else + void WriteTo(::arrow::io::OutputStream* dst) const; +#endif // Return const-pointer to make it clear that this object is not to be copied const SchemaDescriptor* schema() const; @@ -239,8 +281,13 @@ class PARQUET_EXPORT FileMetaData { private: friend FileMetaDataBuilder; + +#ifdef PARQUET_ENCRYPTION explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len, const std::shared_ptr& decryptor = NULLPTR); +#else + explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len); +#endif // PIMPL Idiom FileMetaData(); @@ -248,6 +295,7 @@ class PARQUET_EXPORT FileMetaData { std::unique_ptr impl_; }; +#ifdef PARQUET_ENCRYPTION class PARQUET_EXPORT FileCryptoMetaData { public: // API convenience to get a MetaData accessor @@ -269,6 +317,7 @@ class PARQUET_EXPORT FileCryptoMetaData { class FileCryptoMetaDataImpl; std::unique_ptr impl_; }; +#endif // Builder API class PARQUET_EXPORT ColumnChunkMetaDataBuilder { @@ -293,11 +342,19 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { int64_t total_compressed_size() const; // commit the metadata + +#ifdef PARQUET_ENCRYPTION void Finish(int64_t num_values, int64_t dictonary_page_offset, int64_t index_page_offset, int64_t data_page_offset, int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, bool dictionary_fallback, const std::shared_ptr& encryptor = NULLPTR); +#else + void Finish(int64_t num_values, int64_t dictonary_page_offset, + int64_t index_page_offset, int64_t data_page_offset, + int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, + bool dictionary_fallback); +#endif // The metadata contents, suitable for passing to ColumnChunkMetaData::Make const void* contents() const; @@ -357,8 +414,10 @@ class PARQUET_EXPORT FileMetaDataBuilder { // Complete the Thrift structure std::unique_ptr Finish(); +#ifdef PARQUET_ENCRYPTION // crypto metadata std::unique_ptr GetCryptoMetaData(); +#endif private: explicit 
FileMetaDataBuilder( diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index bed07a4cc57c..a1cb1b134500 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -26,7 +26,10 @@ #include "arrow/type.h" +#ifdef PARQUET_ENCRYPTION #include "parquet/encryption.h" +#endif + #include "parquet/exception.h" #include "parquet/parquet_version.h" #include "parquet/platform.h" @@ -65,6 +68,7 @@ class PARQUET_EXPORT ReaderProperties { int64_t buffer_size() const { return buffer_size_; } +#ifdef PARQUET_ENCRYPTION void file_decryption_properties( const std::shared_ptr& decryption) { file_decryption_properties_ = decryption; @@ -73,12 +77,16 @@ class PARQUET_EXPORT ReaderProperties { FileDecryptionProperties* file_decryption_properties() { return file_decryption_properties_.get(); } +#endif private: MemoryPool* pool_; int64_t buffer_size_; bool buffered_stream_enabled_; + +#ifdef PARQUET_ENCRYPTION std::shared_ptr file_decryption_properties_; +#endif }; ReaderProperties PARQUET_EXPORT default_reader_properties(); @@ -283,11 +291,13 @@ class PARQUET_EXPORT WriterProperties { return this->compression(path->ToDotString(), codec); } +#ifdef PARQUET_ENCRYPTION Builder* encryption( const std::shared_ptr& file_encryption_properties) { file_encryption_properties_ = file_encryption_properties; return this; } +#endif Builder* enable_statistics() { default_column_properties_.set_statistics_enabled(true); @@ -334,10 +344,13 @@ class PARQUET_EXPORT WriterProperties { for (const auto& item : statistics_enabled_) get(item.first).set_statistics_enabled(item.second); - return std::shared_ptr(new WriterProperties( - pool_, dictionary_pagesize_limit_, write_batch_size_, max_row_group_length_, - pagesize_, version_, created_by_, std::move(file_encryption_properties_), - default_column_properties_, column_properties)); + return std::shared_ptr( + new WriterProperties(pool_, dictionary_pagesize_limit_, write_batch_size_, + max_row_group_length_, pagesize_, 
version_, created_by_, +#ifdef PARQUET_ENCRYPTION + std::move(file_encryption_properties_), +#endif + default_column_properties_, column_properties)); } private: @@ -348,7 +361,10 @@ class PARQUET_EXPORT WriterProperties { int64_t pagesize_; ParquetVersion::type version_; std::string created_by_; + +#ifdef PARQUET_ENCRYPTION std::shared_ptr file_encryption_properties_; +#endif // Settings used for each column unless overridden in any of the maps below ColumnProperties default_column_properties_; @@ -372,10 +388,6 @@ class PARQUET_EXPORT WriterProperties { inline std::string created_by() const { return parquet_created_by_; } - inline FileEncryptionProperties* file_encryption_properties() const { - return file_encryption_properties_.get(); - } - inline Encoding::type dictionary_index_encoding() const { if (parquet_version_ == ParquetVersion::PARQUET_1_0) { return Encoding::PLAIN_DICTIONARY; @@ -419,6 +431,11 @@ class PARQUET_EXPORT WriterProperties { return column_properties(path).max_statistics_size(); } +#ifdef PARQUET_ENCRYPTION + inline FileEncryptionProperties* file_encryption_properties() const { + return file_encryption_properties_.get(); + } + std::shared_ptr column_encryption_properties( const std::shared_ptr& path) const { if (file_encryption_properties_) { @@ -427,13 +444,16 @@ class PARQUET_EXPORT WriterProperties { return NULLPTR; } } +#endif private: explicit WriterProperties( MemoryPool* pool, int64_t dictionary_pagesize_limit, int64_t write_batch_size, int64_t max_row_group_length, int64_t pagesize, ParquetVersion::type version, const std::string& created_by, +#ifdef PARQUET_ENCRYPTION std::shared_ptr file_encryption_properties, +#endif const ColumnProperties& default_column_properties, const std::unordered_map& column_properties) : pool_(pool), @@ -443,9 +463,12 @@ class PARQUET_EXPORT WriterProperties { pagesize_(pagesize), parquet_version_(version), parquet_created_by_(created_by), +#ifdef PARQUET_ENCRYPTION 
file_encryption_properties_(file_encryption_properties), +#endif default_column_properties_(default_column_properties), - column_properties_(column_properties) {} + column_properties_(column_properties) { + } MemoryPool* pool_; int64_t dictionary_pagesize_limit_; @@ -454,7 +477,11 @@ class PARQUET_EXPORT WriterProperties { int64_t pagesize_; ParquetVersion::type parquet_version_; std::string parquet_created_by_; + +#ifdef PARQUET_ENCRYPTION std::shared_ptr file_encryption_properties_; +#endif + ColumnProperties default_column_properties_; std::unordered_map column_properties_; }; diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 1e5fdc3d84db..10c202a2f2d9 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -42,16 +42,19 @@ #include #include "arrow/util/logging.h" -#include "parquet/encryption_internal.h" + #include "parquet/exception.h" -#include "parquet/internal_file_decryptor.h" -#include "parquet/internal_file_encryptor.h" #include "parquet/platform.h" #include "parquet/statistics.h" #include "parquet/types.h" #include "parquet/parquet_types.h" // IYWU pragma: export +#ifdef PARQUET_ENCRYPTION +#include "parquet/encryption_internal.h" +#include "parquet/internal_file_decryptor.h" +#include "parquet/internal_file_encryptor.h" +#endif namespace parquet { // Check if thrift version < 0.11.0 @@ -222,8 +225,13 @@ inline void DeserializeThriftUnencryptedMsg(const uint8_t* buf, uint32_t* len, // all the bytes needed to store the thrift message. On return, len will be // set to the actual length of the header. 
template -inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg, - const std::shared_ptr& decryptor = NULLPTR) { +inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg +#ifdef PARQUET_ENCRYPTION + , + const std::shared_ptr& decryptor = NULLPTR +#endif +) { +#ifdef PARQUET_ENCRYPTION // thrift message is not encrypted if (decryptor == NULLPTR) { DeserializeThriftUnencryptedMsg(buf, len, deserialized_msg); @@ -245,6 +253,9 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali DeserializeThriftMsg(decrypted_buffer->data(), &decrypted_buffer_len, deserialized_msg); } +#else + DeserializeThriftUnencryptedMsg(buf, len, deserialized_msg); +#endif // PARQUET_ENCRYPTION } /// Utility class to serialize thrift objects to a binary format. This object @@ -275,12 +286,17 @@ class ThriftSerializer { } template - int64_t Serialize(const T* obj, ArrowOutputStream* out, - const std::shared_ptr& encryptor = NULLPTR) { + int64_t Serialize(const T* obj, ArrowOutputStream* out +#ifdef PARQUET_ENCRYPTION + , + const std::shared_ptr& encryptor = NULLPTR +#endif + ) { uint8_t* out_buffer; uint32_t out_length; SerializeToBuffer(obj, &out_length, &out_buffer); +#ifdef PARQUET_ENCRYPTION // obj is not encrypted if (encryptor == NULLPTR) { PARQUET_THROW_NOT_OK(out->Write(out_buffer, out_length)); @@ -288,6 +304,10 @@ class ThriftSerializer { } else { // obj is encrypted return SerializeEncryptedObj(out, out_buffer, out_length, encryptor); } +#else + PARQUET_THROW_NOT_OK(out->Write(out_buffer, out_length)); + return static_cast(out_length); +#endif } private: @@ -303,6 +323,7 @@ class ThriftSerializer { } } +#ifdef PARQUET_ENCRYPTION int64_t SerializeEncryptedObj(ArrowOutputStream* out, uint8_t* out_buffer, uint32_t out_length, const std::shared_ptr& encryptor) { @@ -316,6 +337,7 @@ class ThriftSerializer { PARQUET_THROW_NOT_OK(out->Write(cipher_buffer->data(), cipher_buffer_len)); return 
static_cast(cipher_buffer_len); } +#endif shared_ptr mem_buffer_; shared_ptr protocol_; From 2e0ef53f577721fa8d5c28b53a34c3eb02b8a6f2 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Mon, 24 Jun 2019 11:42:16 +0300 Subject: [PATCH 115/201] Fix LogicalType --- ...cryption-reader-writer-all-crypto-options.h | 10 +++++----- cpp/src/parquet/CMakeLists.txt | 4 ++-- .../parquet/encryption-configurations-test.cc | 18 +++++++++--------- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.h b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.h index db1b692b9f08..2ca3a064768a 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.h +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.h @@ -21,7 +21,7 @@ #include #include -using parquet::LogicalType; +using parquet::ConvertedType; using parquet::Repetition; using parquet::Type; using parquet::schema::GroupNode; @@ -34,18 +34,18 @@ static std::shared_ptr SetupSchema() { // Create a primitive node named 'boolean_field' with type:BOOLEAN, // repetition:REQUIRED fields.push_back(PrimitiveNode::Make("boolean_field", Repetition::REQUIRED, - Type::BOOLEAN, LogicalType::NONE)); + Type::BOOLEAN, ConvertedType::NONE)); // Create a primitive node named 'int32_field' with type:INT32, repetition:REQUIRED, // logical type:TIME_MILLIS fields.push_back(PrimitiveNode::Make("int32_field", Repetition::REQUIRED, Type::INT32, - LogicalType::TIME_MILLIS)); + ConvertedType::TIME_MILLIS)); fields.push_back(PrimitiveNode::Make("float_field", Repetition::REQUIRED, Type::FLOAT, - LogicalType::NONE)); + ConvertedType::NONE)); fields.push_back(PrimitiveNode::Make("double_field", Repetition::REQUIRED, Type::DOUBLE, - LogicalType::NONE)); + ConvertedType::NONE)); // Create a GroupNode named 'schema' using the primitive nodes defined above // This GroupNode is the root 
node of the schema tree diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 32888014080a..a493966bf685 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -330,8 +330,8 @@ add_parquet_test(arrow-test if(ARROW_USE_OPENSSL) add_parquet_test(encryption-test SOURCES - encryption-properties-test.cc - encryption-configurations-test.cc) + encryption-configurations-test.cc + encryption-properties-test.cc) endif() # Those tests need to use static linking as they access thrift-generated diff --git a/cpp/src/parquet/encryption-configurations-test.cc b/cpp/src/parquet/encryption-configurations-test.cc index 2afc20e52234..b45c47f365bd 100644 --- a/cpp/src/parquet/encryption-configurations-test.cc +++ b/cpp/src/parquet/encryption-configurations-test.cc @@ -73,7 +73,7 @@ namespace parquet { using schema::GroupNode; using schema::NodePtr; using schema::PrimitiveNode; -using parquet::LogicalType; +using parquet::ConvertedType; using parquet::Repetition; using parquet::Type; @@ -540,34 +540,34 @@ class TestEncryptionConfiguration : public ::testing::Test { // Create a primitive node named 'boolean_field' with type:BOOLEAN, // repetition:REQUIRED fields.push_back(PrimitiveNode::Make("boolean_field", Repetition::REQUIRED, - Type::BOOLEAN, LogicalType::NONE)); + Type::BOOLEAN, ConvertedType::NONE)); // Create a primitive node named 'int32_field' with type:INT32, repetition:REQUIRED, // logical type:TIME_MILLIS fields.push_back(PrimitiveNode::Make("int32_field", Repetition::REQUIRED, Type::INT32, - LogicalType::TIME_MILLIS)); + ConvertedType::TIME_MILLIS)); // Create a primitive node named 'int64_field' with type:INT64, repetition:REPEATED fields.push_back(PrimitiveNode::Make("int64_field", Repetition::REPEATED, Type::INT64, - LogicalType::NONE)); + ConvertedType::NONE)); fields.push_back(PrimitiveNode::Make("int96_field", Repetition::REQUIRED, Type::INT96, - LogicalType::NONE)); + ConvertedType::NONE)); 
fields.push_back(PrimitiveNode::Make("float_field", Repetition::REQUIRED, Type::FLOAT, - LogicalType::NONE)); + ConvertedType::NONE)); fields.push_back(PrimitiveNode::Make("double_field", Repetition::REQUIRED, - Type::DOUBLE, LogicalType::NONE)); + Type::DOUBLE, ConvertedType::NONE)); // Create a primitive node named 'ba_field' with type:BYTE_ARRAY, repetition:OPTIONAL fields.push_back(PrimitiveNode::Make("ba_field", Repetition::OPTIONAL, - Type::BYTE_ARRAY, LogicalType::NONE)); + Type::BYTE_ARRAY, ConvertedType::NONE)); // Create a primitive node named 'flba_field' with type:FIXED_LEN_BYTE_ARRAY, // repetition:REQUIRED, field_length = kFixedLength fields.push_back(PrimitiveNode::Make("flba_field", Repetition::REQUIRED, - Type::FIXED_LEN_BYTE_ARRAY, LogicalType::NONE, + Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, kFixedLength)); // Create a GroupNode named 'schema' using the primitive nodes defined above From 376b4adc821f1f0faaf56ce3f21975fd3115a96b Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Mon, 24 Jun 2019 23:35:20 +0700 Subject: [PATCH 116/201] keep encryption parameters at method declaration --- cpp/src/parquet/column_reader.cc | 10 +-- cpp/src/parquet/column_reader.h | 7 +- cpp/src/parquet/file_reader.cc | 37 ++++------ cpp/src/parquet/file_writer.cc | 27 +++---- cpp/src/parquet/metadata.cc | 116 +++++-------------------------- cpp/src/parquet/metadata.h | 40 ----------- cpp/src/parquet/properties.h | 31 +++------ cpp/src/parquet/thrift.h | 21 +++--- 8 files changed, 65 insertions(+), 224 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 4b831a8d3b72..f2ae8454a5e2 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -131,14 +131,10 @@ class SerializedPageReader : public PageReader { column_ordinal_(-1), page_ordinal_(-1), seen_num_rows_(0), - total_num_rows_(total_num_rows) -#ifdef PARQUET_ENCRYPTION - , + total_num_rows_(total_num_rows), 
decryption_buffer_(AllocateBuffer(pool, 0)), meta_decryptor_(NULLPTR), - data_decryptor_(NULLPTR) -#endif - { + data_decryptor_(NULLPTR) { if (ctx != NULLPTR) { column_has_dictionary_ = ctx->column_has_dictionary; row_group_ordinal_ = ctx->row_group_ordinal; @@ -209,7 +205,6 @@ class SerializedPageReader : public PageReader { // Number of rows in all the data pages int64_t total_num_rows_; -#ifdef PARQUET_ENCRYPTION // data_pageAAD_ and data_page_headerAAD_ contain the AAD for data page and data page // header in a single column respectively. // While calculating AAD for different pages in a single column the pages AAD is @@ -220,7 +215,6 @@ class SerializedPageReader : public PageReader { std::shared_ptr decryption_buffer_; std::shared_ptr meta_decryptor_; std::shared_ptr data_decryptor_; -#endif }; #ifdef PARQUET_ENCRYPTION diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 1298b15b62db..aa2e3a3c069f 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -43,11 +43,8 @@ class RleDecoder; namespace parquet { -class Page; - -#ifdef PARQUET_ENCRYPTION class Decryptor; -#endif +class Page; // 16 MB is the default maximum page header size static constexpr uint32_t kDefaultMaxPageHeaderSize = 16 * 1024 * 1024; @@ -81,10 +78,8 @@ struct PageReaderContext { int16_t row_group_ordinal; int16_t column_ordinal; -#ifdef PARQUET_ENCRYPTION std::shared_ptr meta_decryptor; std::shared_ptr data_decryptor; -#endif }; // Abstract page iterator interface. 
This way, we can feed column pages to the diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 822414713a15..d751f46338a9 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -34,7 +34,6 @@ #include "parquet/deprecated_io.h" #include "parquet/exception.h" #include "parquet/file_writer.h" -#include "parquet/internal_file_decryptor.h" #include "parquet/metadata.h" #include "parquet/platform.h" #include "parquet/properties.h" @@ -43,6 +42,11 @@ #ifdef PARQUET_ENCRYPTION #include "parquet/encryption_internal.h" +#include "parquet/internal_file_decryptor.h" +#else +namespace parquet { +class InternalFileDecryptor; +} #endif namespace parquet { @@ -87,21 +91,13 @@ class SerializedRowGroup : public RowGroupReader::Contents { public: SerializedRowGroup(const std::shared_ptr& source, FileMetaData* file_metadata, int row_group_number, - const ReaderProperties& props -#ifdef PARQUET_ENCRYPTION - , - InternalFileDecryptor* file_decryptor -#endif - ) + const ReaderProperties& props, + InternalFileDecryptor* file_decryptor = NULLPTR) : source_(source), file_metadata_(file_metadata), properties_(props), - row_group_ordinal_((int16_t)row_group_number) -#ifdef PARQUET_ENCRYPTION - , - file_decryptor_(file_decryptor) -#endif - { + row_group_ordinal_((int16_t)row_group_number), + file_decryptor_(file_decryptor) { row_group_metadata_ = file_metadata->RowGroup(row_group_number); } @@ -111,11 +107,7 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::unique_ptr GetColumnPageReader(int i) override { // Read column chunk from the file -#ifdef PARQUET_ENCRYPTION auto col = row_group_metadata_->ColumnChunk(i, row_group_ordinal_, file_decryptor_); -#else - auto col = row_group_metadata_->ColumnChunk(i, row_group_ordinal_); -#endif int64_t col_start = col->data_page_offset(); if (col->has_dictionary_page() && col->dictionary_page_offset() > 0 && @@ -190,10 +182,7 @@ class SerializedRowGroup : public 
RowGroupReader::Contents { std::unique_ptr row_group_metadata_; ReaderProperties properties_; int16_t row_group_ordinal_; - -#ifdef PARQUET_ENCRYPTION InternalFileDecryptor* file_decryptor_; -#endif }; // ---------------------------------------------------------------------- @@ -222,12 +211,12 @@ class SerializedFile : public ParquetFileReader::Contents { } std::shared_ptr GetRowGroup(int i) override { - std::unique_ptr contents( - new SerializedRowGroup(source_, file_metadata_.get(), i, #ifdef PARQUET_ENCRYPTION - properties_, file_decryptor_.get())); + std::unique_ptr contents(new SerializedRowGroup( + source_, file_metadata_.get(), i, properties_, file_decryptor_.get())); #else - properties_)); + std::unique_ptr contents( + new SerializedRowGroup(source_, file_metadata_.get(), i, properties_)); #endif return std::make_shared(std::move(contents)); } diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 72abbf4407b4..d5d7abacfc4d 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -31,6 +31,10 @@ #ifdef PARQUET_ENCRYPTION #include "parquet/encryption_internal.h" #include "parquet/internal_file_encryptor.h" +#else +namespace parquet { +class InternalFileEncryptor; +} #endif using arrow::MemoryPool; @@ -83,12 +87,8 @@ class RowGroupSerializer : public RowGroupWriter::Contents { public: RowGroupSerializer(const std::shared_ptr& sink, RowGroupMetaDataBuilder* metadata, int16_t row_group_ordinal, - const WriterProperties* properties, bool buffered_row_group = false -#ifdef PARQUET_ENCRYPTION - , - InternalFileEncryptor* file_encryptor = NULLPTR -#endif - ) + const WriterProperties* properties, bool buffered_row_group = false, + InternalFileEncryptor* file_encryptor = NULLPTR) : sink_(sink), metadata_(metadata), properties_(properties), @@ -97,12 +97,8 @@ class RowGroupSerializer : public RowGroupWriter::Contents { row_group_ordinal_ (row_group_ordinal), next_column_index_(0), num_rows_(0), - 
buffered_row_group_(buffered_row_group) -#ifdef PARQUET_ENCRYPTION - , - file_encryptor_(file_encryptor) -#endif - { + buffered_row_group_(buffered_row_group), + file_encryptor_(file_encryptor) { if (buffered_row_group) { InitColumns(); } else { @@ -224,10 +220,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents { int next_column_index_; mutable int64_t num_rows_; bool buffered_row_group_; - -#ifdef PARQUET_ENCRYPTION InternalFileEncryptor* file_encryptor_; -#endif void CheckRowsWritten() const { // verify when only one column is written at a time @@ -345,11 +338,13 @@ class FileSerializer : public ParquetFileWriter::Contents { } num_row_groups_++; auto rg_metadata = metadata_->AppendRowGroup(); +#ifdef PARQUET_ENCRYPTION std::unique_ptr contents(new RowGroupSerializer( sink_, rg_metadata, static_cast(num_row_groups_ - 1), properties_.get(), -#ifdef PARQUET_ENCRYPTION buffered_row_group, file_encryptor_.get())); #else + std::unique_ptr contents(new RowGroupSerializer( + sink_, rg_metadata, static_cast(num_row_groups_ - 1), properties_.get(), buffered_row_group)); #endif row_group_writer_.reset(new RowGroupWriter(std::move(contents))); diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 664506e17018..b1c4f0e08657 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -47,6 +47,10 @@ using ::std::smatch; #ifdef PARQUET_ENCRYPTION #include "parquet/internal_file_decryptor.h" +#else +namespace parquet { +class Decryptor; +} #endif namespace parquet { @@ -183,12 +187,8 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { explicit ColumnChunkMetaDataImpl(const format::ColumnChunk* column, const ColumnDescriptor* descr, int16_t row_group_ordinal, int16_t column_ordinal, - const ApplicationVersion* writer_version -#ifdef PARQUET_ENCRYPTION - , - InternalFileDecryptor* file_decryptor = NULLPTR -#endif - ) + const ApplicationVersion* writer_version, + InternalFileDecryptor* file_decryptor = NULLPTR) : 
column_(column), descr_(descr), writer_version_(writer_version) { #ifdef PARQUET_ENCRYPTION if (column->__isset.crypto_metadata) { // column metadata is encrypted @@ -346,41 +346,21 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { std::unique_ptr ColumnChunkMetaData::Make( const void* metadata, const ColumnDescriptor* descr, const ApplicationVersion* writer_version, int16_t row_group_ordinal, - int16_t column_ordinal -#ifdef PARQUET_ENCRYPTION - , - InternalFileDecryptor* file_decryptor -#endif -) { + int16_t column_ordinal, InternalFileDecryptor* file_decryptor) { return std::unique_ptr( new ColumnChunkMetaData(metadata, descr, row_group_ordinal, column_ordinal, -#ifdef PARQUET_ENCRYPTION writer_version, file_decryptor)); -#else - writer_version)); -#endif } ColumnChunkMetaData::ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal, int16_t column_ordinal, - const ApplicationVersion* writer_version -#ifdef PARQUET_ENCRYPTION - , - InternalFileDecryptor* file_decryptor -#endif - ) + const ApplicationVersion* writer_version, + InternalFileDecryptor* file_decryptor) : impl_{std::unique_ptr(new ColumnChunkMetaDataImpl( reinterpret_cast(metadata), descr, - row_group_ordinal, column_ordinal, -#ifdef PARQUET_ENCRYPTION - writer_version, file_decryptor))} { -} -#else - writer_version))} { -} -#endif + row_group_ordinal, column_ordinal, writer_version, file_decryptor))} {} ColumnChunkMetaData::~ColumnChunkMetaData() {} // column chunk @@ -468,12 +448,7 @@ class RowGroupMetaData::RowGroupMetaDataImpl { inline const SchemaDescriptor* schema() const { return schema_; } std::unique_ptr ColumnChunk( - int i, -#ifdef PARQUET_ENCRYPTION - int16_t row_group_ordinal, InternalFileDecryptor* file_decryptor = NULLPTR) { -#else - int16_t row_group_ordinal) { -#endif + int i, int16_t row_group_ordinal, InternalFileDecryptor* file_decryptor = NULLPTR) { if (!(i < num_columns())) { std::stringstream ss; ss << "The file only has " << 
num_columns() @@ -481,12 +456,8 @@ class RowGroupMetaData::RowGroupMetaDataImpl { throw ParquetException(ss.str()); } return ColumnChunkMetaData::Make(&row_group_->columns[i], schema_->Column(i), - writer_version_, row_group_ordinal, -#ifdef PARQUET_ENCRYPTION - (int16_t)i, file_decryptor); -#else - (int16_t)i); -#endif + writer_version_, row_group_ordinal, (int16_t)i, + file_decryptor); } private: @@ -517,37 +488,22 @@ int64_t RowGroupMetaData::total_byte_size() const { return impl_->total_byte_siz const SchemaDescriptor* RowGroupMetaData::schema() const { return impl_->schema(); } -#ifdef PARQUET_ENCRYPTION std::unique_ptr RowGroupMetaData::ColumnChunk( int i, int16_t row_group_ordinal, InternalFileDecryptor* file_decryptor) const { return impl_->ColumnChunk(i, row_group_ordinal, file_decryptor); } -#else -std::unique_ptr RowGroupMetaData::ColumnChunk( - int i, int16_t row_group_ordinal) const { - return impl_->ColumnChunk(i, row_group_ordinal); -} -#endif // file metadata class FileMetaData::FileMetaDataImpl { public: FileMetaDataImpl() : metadata_len_(0) {} -#ifdef PARQUET_ENCRYPTION explicit FileMetaDataImpl(const void* metadata, uint32_t* metadata_len, const std::shared_ptr& decryptor = nullptr) -#else - explicit FileMetaDataImpl(const void* metadata, uint32_t* metadata_len) -#endif : metadata_len_(0) { metadata_.reset(new format::FileMetaData); DeserializeThriftMsg(reinterpret_cast(metadata), metadata_len, -#ifdef PARQUET_ENCRYPTION metadata_.get(), decryptor); -#else - metadata_.get()); -#endif metadata_len_ = *metadata_len; if (metadata_->__isset.created_by) { @@ -611,10 +567,10 @@ class FileMetaData::FileMetaDataImpl { const ApplicationVersion& writer_version() const { return writer_version_; } -#ifdef PARQUET_ENCRYPTION void WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryptor) const { ThriftSerializer serializer; +#ifdef PARQUET_ENCRYPTION // Only in encrypted files with plaintext footers the // encryption_algorithm is set in footer 
if (is_encryption_algorithm_set()) { @@ -640,13 +596,10 @@ class FileMetaData::FileMetaDataImpl { // or encrypted file with encrypted footer serializer.Serialize(metadata_.get(), dst, encryptor); } - } #else - void WriteTo(::arrow::io::OutputStream* dst) const { - ThriftSerializer serializer; serializer.Serialize(metadata_.get(), dst); - } #endif // PARQUET_ENCRYPTION + } std::unique_ptr RowGroup(int i) { if (!(i < num_row_groups())) { @@ -729,7 +682,6 @@ class FileMetaData::FileMetaDataImpl { std::shared_ptr key_value_metadata_; }; -#ifdef PARQUET_ENCRYPTION std::shared_ptr FileMetaData::Make( const void* metadata, uint32_t* metadata_len, const std::shared_ptr& decryptor) { @@ -737,24 +689,11 @@ std::shared_ptr FileMetaData::Make( return std::shared_ptr( new FileMetaData(metadata, metadata_len, decryptor)); } -#else -std::shared_ptr FileMetaData::Make(const void* metadata, - uint32_t* metadata_len) { - // This FileMetaData ctor is private, not compatible with std::make_shared - return std::shared_ptr(new FileMetaData(metadata, metadata_len)); -} -#endif -#ifdef PARQUET_ENCRYPTION FileMetaData::FileMetaData(const void* metadata, uint32_t* metadata_len, const std::shared_ptr& decryptor) : impl_{std::unique_ptr( new FileMetaDataImpl(metadata, metadata_len, decryptor))} {} -#else -FileMetaData::FileMetaData(const void* metadata, uint32_t* metadata_len) - : impl_{std::unique_ptr( - new FileMetaDataImpl(metadata, metadata_len))} {} -#endif FileMetaData::FileMetaData() : impl_{std::unique_ptr(new FileMetaDataImpl())} {} @@ -827,16 +766,10 @@ void FileMetaData::AppendRowGroups(const FileMetaData& other) { impl_->AppendRowGroups(other.impl_); } -#ifdef PARQUET_ENCRYPTION void FileMetaData::WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryptor) const { return impl_->WriteTo(dst, encryptor); } -#else -void FileMetaData::WriteTo(::arrow::io::OutputStream* dst) const { - return impl_->WriteTo(dst); -} -#endif #ifdef PARQUET_ENCRYPTION class 
FileCryptoMetaData::FileCryptoMetaDataImpl { @@ -1029,12 +962,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { void Finish(int64_t num_values, int64_t dictionary_page_offset, int64_t index_page_offset, int64_t data_page_offset, int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, - bool dictionary_fallback -#ifdef PARQUET_ENCRYPTION - , - const std::shared_ptr& encryptor -#endif - ) { + bool dictionary_fallback, const std::shared_ptr& encryptor) { if (dictionary_page_offset > 0) { column_chunk_->meta_data.__set_dictionary_page_offset(dictionary_page_offset); column_chunk_->__set_file_offset(dictionary_page_offset + compressed_size); @@ -1190,19 +1118,11 @@ void ColumnChunkMetaDataBuilder::Finish(int64_t num_values, int64_t index_page_offset, int64_t data_page_offset, int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, -#ifdef PARQUET_ENCRYPTION bool dictionary_fallback, const std::shared_ptr& encryptor) { -#else - bool dictionary_fallback) { -#endif impl_->Finish(num_values, dictionary_page_offset, index_page_offset, data_page_offset, - compressed_size, uncompressed_size, has_dictionary, -#ifdef PARQUET_ENCRYPTION - dictionary_fallback, encryptor); -#else - dictionary_fallback); -#endif + compressed_size, uncompressed_size, has_dictionary, dictionary_fallback, + encryptor); } void ColumnChunkMetaDataBuilder::WriteTo(::arrow::io::OutputStream* sink) { diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 186a27ae9c36..40da66ec8141 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -38,13 +38,11 @@ class EncodedStatistics; class Statistics; class SchemaDescriptor; -#ifdef PARQUET_ENCRYPTION class FileCryptoMetaData; class InternalFileDecryptor; class Decryptor; class Encryptor; class FooterSigningEncryptor; -#endif namespace schema { @@ -128,17 +126,10 @@ class PARQUET_EXPORT ColumnCryptoMetaData { class PARQUET_EXPORT ColumnChunkMetaData { public: // API 
convenience to get a MetaData accessor -#ifdef PARQUET_ENCRYPTION static std::unique_ptr Make( const void* metadata, const ColumnDescriptor* descr, const ApplicationVersion* writer_version = NULLPTR, int16_t row_group_ordinal = -1, int16_t column_ordinal = -1, InternalFileDecryptor* file_decryptor = NULLPTR); -#else - static std::unique_ptr Make( - const void* metadata, const ColumnDescriptor* descr, - const ApplicationVersion* writer_version = NULLPTR, int16_t row_group_ordinal = -1, - int16_t column_ordinal = -1); -#endif ~ColumnChunkMetaData(); @@ -169,16 +160,10 @@ class PARQUET_EXPORT ColumnChunkMetaData { #endif private: -#ifdef PARQUET_ENCRYPTION explicit ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal, int16_t column_ordinal, const ApplicationVersion* writer_version = NULLPTR, InternalFileDecryptor* file_decryptor = NULLPTR); -#else - explicit ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, - int16_t row_group_ordinal, int16_t column_ordinal, - const ApplicationVersion* writer_version = NULLPTR); -#endif // PIMPL Idiom class ColumnChunkMetaDataImpl; std::unique_ptr impl_; @@ -200,14 +185,9 @@ class PARQUET_EXPORT RowGroupMetaData { // Return const-pointer to make it clear that this object is not to be copied const SchemaDescriptor* schema() const; -#ifdef PARQUET_ENCRYPTION std::unique_ptr ColumnChunk( int i, int16_t row_group_ordinal = -1, InternalFileDecryptor* file_decryptor = NULLPTR) const; -#else - std::unique_ptr ColumnChunk(int i, - int16_t row_group_ordinal = -1) const; -#endif private: explicit RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema, @@ -223,14 +203,9 @@ class PARQUET_EXPORT FileMetaData { public: // API convenience to get a MetaData accessor -#ifdef PARQUET_ENCRYPTION static std::shared_ptr Make( const void* serialized_metadata, uint32_t* metadata_len, const std::shared_ptr& decryptor = NULLPTR); -#else - static std::shared_ptr Make(const 
void* serialized_metadata, - uint32_t* metadata_len); -#endif ~FileMetaData(); #ifdef PARQUET_ENCRYPTION @@ -261,12 +236,8 @@ class PARQUET_EXPORT FileMetaData { const std::string& footer_signing_key_metadata() const; #endif -#ifdef PARQUET_ENCRYPTION void WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryptor = NULLPTR) const; -#else - void WriteTo(::arrow::io::OutputStream* dst) const; -#endif // Return const-pointer to make it clear that this object is not to be copied const SchemaDescriptor* schema() const; @@ -282,12 +253,8 @@ class PARQUET_EXPORT FileMetaData { private: friend FileMetaDataBuilder; -#ifdef PARQUET_ENCRYPTION explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len, const std::shared_ptr& decryptor = NULLPTR); -#else - explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len); -#endif // PIMPL Idiom FileMetaData(); @@ -343,18 +310,11 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { int64_t total_compressed_size() const; // commit the metadata -#ifdef PARQUET_ENCRYPTION void Finish(int64_t num_values, int64_t dictonary_page_offset, int64_t index_page_offset, int64_t data_page_offset, int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, bool dictionary_fallback, const std::shared_ptr& encryptor = NULLPTR); -#else - void Finish(int64_t num_values, int64_t dictonary_page_offset, - int64_t index_page_offset, int64_t data_page_offset, - int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, - bool dictionary_fallback); -#endif // The metadata contents, suitable for passing to ColumnChunkMetaData::Make const void* contents() const; diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index a1cb1b134500..9a6256d0e372 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -28,6 +28,12 @@ #ifdef PARQUET_ENCRYPTION #include "parquet/encryption.h" +#else +namespace parquet { +class FileEncryptionProperties; 
+class FileDecryptionProperties; +class ColumnEncryptionProperties; +} // namespace parquet #endif #include "parquet/exception.h" @@ -83,10 +89,7 @@ class PARQUET_EXPORT ReaderProperties { MemoryPool* pool_; int64_t buffer_size_; bool buffered_stream_enabled_; - -#ifdef PARQUET_ENCRYPTION std::shared_ptr file_decryption_properties_; -#endif }; ReaderProperties PARQUET_EXPORT default_reader_properties(); @@ -344,13 +347,10 @@ class PARQUET_EXPORT WriterProperties { for (const auto& item : statistics_enabled_) get(item.first).set_statistics_enabled(item.second); - return std::shared_ptr( - new WriterProperties(pool_, dictionary_pagesize_limit_, write_batch_size_, - max_row_group_length_, pagesize_, version_, created_by_, -#ifdef PARQUET_ENCRYPTION - std::move(file_encryption_properties_), -#endif - default_column_properties_, column_properties)); + return std::shared_ptr(new WriterProperties( + pool_, dictionary_pagesize_limit_, write_batch_size_, max_row_group_length_, + pagesize_, version_, created_by_, std::move(file_encryption_properties_), + default_column_properties_, column_properties)); } private: @@ -362,9 +362,7 @@ class PARQUET_EXPORT WriterProperties { ParquetVersion::type version_; std::string created_by_; -#ifdef PARQUET_ENCRYPTION std::shared_ptr file_encryption_properties_; -#endif // Settings used for each column unless overridden in any of the maps below ColumnProperties default_column_properties_; @@ -451,9 +449,7 @@ class PARQUET_EXPORT WriterProperties { MemoryPool* pool, int64_t dictionary_pagesize_limit, int64_t write_batch_size, int64_t max_row_group_length, int64_t pagesize, ParquetVersion::type version, const std::string& created_by, -#ifdef PARQUET_ENCRYPTION std::shared_ptr file_encryption_properties, -#endif const ColumnProperties& default_column_properties, const std::unordered_map& column_properties) : pool_(pool), @@ -463,12 +459,9 @@ class PARQUET_EXPORT WriterProperties { pagesize_(pagesize), parquet_version_(version), 
parquet_created_by_(created_by), -#ifdef PARQUET_ENCRYPTION file_encryption_properties_(file_encryption_properties), -#endif default_column_properties_(default_column_properties), - column_properties_(column_properties) { - } + column_properties_(column_properties) {} MemoryPool* pool_; int64_t dictionary_pagesize_limit_; @@ -478,9 +471,7 @@ class PARQUET_EXPORT WriterProperties { ParquetVersion::type parquet_version_; std::string parquet_created_by_; -#ifdef PARQUET_ENCRYPTION std::shared_ptr file_encryption_properties_; -#endif ColumnProperties default_column_properties_; std::unordered_map column_properties_; diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 10c202a2f2d9..b051df3480c2 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -54,6 +54,11 @@ #include "parquet/encryption_internal.h" #include "parquet/internal_file_decryptor.h" #include "parquet/internal_file_encryptor.h" +#else +namespace parquet { +class Encryptor; +class Decryptor; +} // namespace parquet #endif namespace parquet { @@ -225,12 +230,8 @@ inline void DeserializeThriftUnencryptedMsg(const uint8_t* buf, uint32_t* len, // all the bytes needed to store the thrift message. On return, len will be // set to the actual length of the header. 
template -inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg -#ifdef PARQUET_ENCRYPTION - , - const std::shared_ptr& decryptor = NULLPTR -#endif -) { +inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg, + const std::shared_ptr& decryptor = NULLPTR) { #ifdef PARQUET_ENCRYPTION // thrift message is not encrypted if (decryptor == NULLPTR) { @@ -286,12 +287,8 @@ class ThriftSerializer { } template - int64_t Serialize(const T* obj, ArrowOutputStream* out -#ifdef PARQUET_ENCRYPTION - , - const std::shared_ptr& encryptor = NULLPTR -#endif - ) { + int64_t Serialize(const T* obj, ArrowOutputStream* out, + const std::shared_ptr& encryptor = NULLPTR) { uint8_t* out_buffer; uint32_t out_length; SerializeToBuffer(obj, &out_length, &out_buffer); From 36cd316299d3f2c380f74ef01657721399ad2231 Mon Sep 17 00:00:00 2001 From: Ha Thi Tham Date: Tue, 25 Jun 2019 11:03:20 +0700 Subject: [PATCH 117/201] add PARQUET_EXPORT into Builder class of encryption properties --- cpp/src/parquet/encryption.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index f12db27dbec7..f4a66ce81fee 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -97,7 +97,7 @@ inline uint8_t* str2bytes(const std::string& str) { class PARQUET_EXPORT ColumnEncryptionProperties { public: - class Builder { + class PARQUET_EXPORT Builder { public: /// Convenience builder for regular (not nested) columns. explicit Builder(const std::string& name) { @@ -188,7 +188,7 @@ class PARQUET_EXPORT ColumnEncryptionProperties { class PARQUET_EXPORT ColumnDecryptionProperties { public: - class Builder { + class PARQUET_EXPORT Builder { public: /// convenience builder for regular (not nested) columns. 
explicit Builder(const std::string& name) @@ -255,7 +255,7 @@ class PARQUET_EXPORT AADPrefixVerifier { class PARQUET_EXPORT FileDecryptionProperties { public: - class Builder { + class PARQUET_EXPORT Builder { public: Builder() { check_plaintext_footer_integrity_ = kDefaultCheckSignature; @@ -400,7 +400,7 @@ class PARQUET_EXPORT FileDecryptionProperties { class PARQUET_EXPORT FileEncryptionProperties { public: - class Builder { + class PARQUET_EXPORT Builder { public: explicit Builder(const std::string& footer_key) : parquet_cipher_(kDefaultEncryptionAlgorithm), From e5a771a7d8df19b89f685987ed4af8629d3ca73c Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Tue, 25 Jun 2019 14:45:45 +0300 Subject: [PATCH 118/201] Change assert to ASSERT_EQ in encryption-configurations-test.cc --- .../parquet/encryption-configurations-test.cc | 64 +++++++++---------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/cpp/src/parquet/encryption-configurations-test.cc b/cpp/src/parquet/encryption-configurations-test.cc index b45c47f365bd..c913f7917c00 100644 --- a/cpp/src/parquet/encryption-configurations-test.cc +++ b/cpp/src/parquet/encryption-configurations-test.cc @@ -294,11 +294,11 @@ class TestEncryptionConfiguration : public ::testing::Test { // Get the number of RowGroups int num_row_groups = file_metadata->num_row_groups(); - assert(num_row_groups == 1); + ASSERT_EQ(num_row_groups, 1); // Get the number of Columns int num_columns = file_metadata->num_columns(); - assert(num_columns == 8); + ASSERT_EQ(num_columns, 8); // Iterate over all the RowGroups in the file for (int r = 0; r < num_row_groups; ++r) { @@ -326,12 +326,12 @@ class TestEncryptionConfiguration : public ::testing::Test { // contains the number of non-null rows rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); // Ensure only one value is read - assert(rows_read == 1); + ASSERT_EQ(rows_read, 1); // There are no NULL values in the rows written - assert(values_read == 1); + 
ASSERT_EQ(values_read, 1); // Verify the value written bool expected_value = ((i % 2) == 0) ? true : false; - assert(value == expected_value); + ASSERT_EQ(value, expected_value); i++; } // Get the Column Reader for the Int32 column @@ -346,11 +346,11 @@ class TestEncryptionConfiguration : public ::testing::Test { // contains the number of non-null rows rows_read = int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); // Ensure only one value is read - assert(rows_read == 1); + ASSERT_EQ(rows_read, 1); // There are no NULL values in the rows written - assert(values_read == 1); + ASSERT_EQ(values_read, 1); // Verify the value written - assert(value == i); + ASSERT_EQ(value, i); i++; } // Get the Column Reader for the Int64 column @@ -366,17 +366,17 @@ class TestEncryptionConfiguration : public ::testing::Test { rows_read = int64_reader->ReadBatch(1, &definition_level, &repetition_level, &value, &values_read); // Ensure only one value is read - assert(rows_read == 1); + ASSERT_EQ(rows_read, 1); // There are no NULL values in the rows written - assert(values_read == 1); + ASSERT_EQ(values_read, 1); // Verify the value written int64_t expected_value = i * 1000 * 1000; expected_value *= 1000 * 1000; - assert(value == expected_value); + ASSERT_EQ(value, expected_value); if ((i % 2) == 0) { - assert(repetition_level == 1); + ASSERT_EQ(repetition_level, 1); } else { - assert(repetition_level == 0); + ASSERT_EQ(repetition_level, 0); } i++; } @@ -393,16 +393,16 @@ class TestEncryptionConfiguration : public ::testing::Test { // contains the number of non-null rows rows_read = int96_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); // Ensure only one value is read - assert(rows_read == 1); + ASSERT_EQ(rows_read, 1); // There are no NULL values in the rows written - assert(values_read == 1); + ASSERT_EQ(values_read, 1); // Verify the value written parquet::Int96 expected_value; expected_value.value[0] = i; expected_value.value[1] = i + 1; 
expected_value.value[2] = i + 2; for (int j = 0; j < 3; j++) { - assert(value.value[j] == expected_value.value[j]); + ASSERT_EQ(value.value[j], expected_value.value[j]); } i++; } @@ -419,12 +419,12 @@ class TestEncryptionConfiguration : public ::testing::Test { // contains the number of non-null rows rows_read = float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); // Ensure only one value is read - assert(rows_read == 1); + ASSERT_EQ(rows_read, 1); // There are no NULL values in the rows written - assert(values_read == 1); + ASSERT_EQ(values_read, 1); // Verify the value written float expected_value = static_cast(i) * 1.1f; - assert(value == expected_value); + ASSERT_EQ(value, expected_value); i++; } // Get the Column Reader for the Double column @@ -439,12 +439,12 @@ class TestEncryptionConfiguration : public ::testing::Test { // contains the number of non-null rows rows_read = double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); // Ensure only one value is read - assert(rows_read == 1); + ASSERT_EQ(rows_read, 1); // There are no NULL values in the rows written - assert(values_read == 1); + ASSERT_EQ(values_read, 1); // Verify the value written double expected_value = i * 1.1111111; - assert(value == expected_value); + ASSERT_EQ(value, expected_value); i++; } // Get the Column Reader for the ByteArray column @@ -460,7 +460,7 @@ class TestEncryptionConfiguration : public ::testing::Test { rows_read = ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); // Ensure only one value is read - assert(rows_read == 1); + ASSERT_EQ(rows_read, 1); // Verify the value written char expected_value[kFixedLength] = "parquet"; expected_value[7] = static_cast('0' + i / 100); @@ -468,14 +468,14 @@ class TestEncryptionConfiguration : public ::testing::Test { expected_value[9] = static_cast('0' + i % 10); if (i % 2 == 0) { // only alternate values exist // There are no NULL values in the rows written - assert(values_read == 1); - 
assert(value.len == kFixedLength); - assert(memcmp(value.ptr, &expected_value[0], kFixedLength) == 0); - assert(definition_level == 1); + ASSERT_EQ(values_read, 1); + ASSERT_EQ(value.len, kFixedLength); + ASSERT_EQ(memcmp(value.ptr, &expected_value[0], kFixedLength), 0); + ASSERT_EQ(definition_level, 1); } else { // There are NULL values in the rows written - assert(values_read == 0); - assert(definition_level == 0); + ASSERT_EQ(values_read, 0); + ASSERT_EQ(definition_level, 0); } i++; } @@ -491,13 +491,13 @@ class TestEncryptionConfiguration : public ::testing::Test { // contains the number of non-null rows rows_read = flba_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); // Ensure only one value is read - assert(rows_read == 1); + ASSERT_EQ(rows_read, 1); // There are no NULL values in the rows written - assert(values_read == 1); + ASSERT_EQ(values_read, 1); // Verify the value written char v = static_cast(i); char expected_value[kFixedLength] = {v, v, v, v, v, v, v, v, v, v}; - assert(memcmp(value.ptr, &expected_value[0], kFixedLength) == 0); + ASSERT_EQ(memcmp(value.ptr, &expected_value[0], kFixedLength), 0); i++; } } From 0458d0ddb54cd5dc85cdfefff3015c7f7501a731 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Tue, 25 Jun 2019 22:45:54 +0700 Subject: [PATCH 119/201] fix cmake format --- cpp/src/parquet/CMakeLists.txt | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index a493966bf685..1a1472e3dd81 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -180,14 +180,14 @@ set(PARQUET_SRCS statistics.cc types.cc) -if (ARROW_USE_OPENSSL) +if(ARROW_USE_OPENSSL) add_definitions(-DPARQUET_ENCRYPTION) set(PARQUET_SRCS - ${PARQUET_SRCS} - encryption.cc - encryption_internal.cc - internal_file_decryptor.cc - internal_file_encryptor.cc) + ${PARQUET_SRCS} + encryption.cc + encryption_internal.cc + internal_file_decryptor.cc + 
internal_file_encryptor.cc) endif() # Ensure that thrift compilation is done before using its generated headers @@ -328,10 +328,8 @@ add_parquet_test(arrow-test test_util.cc) if(ARROW_USE_OPENSSL) - add_parquet_test(encryption-test - SOURCES - encryption-configurations-test.cc - encryption-properties-test.cc) + add_parquet_test(encryption-test SOURCES encryption-properties-test.cc + encryption-configurations-test.cc) endif() # Those tests need to use static linking as they access thrift-generated From 2a2a50cded11b306e54d98557cd2f9bc6c736a00 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Wed, 26 Jun 2019 10:37:46 +0300 Subject: [PATCH 120/201] Add MemoryPool field to Decryptors/Encryptors --- .../parquet/encryption-configurations-test.cc | 6 ++-- cpp/src/parquet/file_reader.cc | 30 +++++++++---------- cpp/src/parquet/file_writer.cc | 3 +- cpp/src/parquet/internal_file_decryptor.cc | 27 ++++++++++------- cpp/src/parquet/internal_file_decryptor.h | 10 +++++-- cpp/src/parquet/internal_file_encryptor.cc | 22 +++++++++----- cpp/src/parquet/internal_file_encryptor.h | 9 ++++-- cpp/src/parquet/thrift.h | 4 +-- 8 files changed, 68 insertions(+), 43 deletions(-) diff --git a/cpp/src/parquet/encryption-configurations-test.cc b/cpp/src/parquet/encryption-configurations-test.cc index c913f7917c00..7c79fd14c9d6 100644 --- a/cpp/src/parquet/encryption-configurations-test.cc +++ b/cpp/src/parquet/encryption-configurations-test.cc @@ -70,12 +70,12 @@ namespace parquet { -using schema::GroupNode; -using schema::NodePtr; -using schema::PrimitiveNode; using parquet::ConvertedType; using parquet::Repetition; using parquet::Type; +using schema::GroupNode; +using schema::NodePtr; +using schema::PrimitiveNode; constexpr int kFixedLength = 10; diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index d751f46338a9..02c80386429b 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -65,9 +65,9 @@ 
RowGroupReader::RowGroupReader(std::unique_ptr contents) : contents_(std::move(contents)) {} std::shared_ptr RowGroupReader::Column(int i) { - DCHECK(i < metadata()->num_columns()) << "The RowGroup only has " - << metadata()->num_columns() - << "columns, requested column: " << i; + DCHECK(i < metadata()->num_columns()) + << "The RowGroup only has " << metadata()->num_columns() + << "columns, requested column: " << i; const ColumnDescriptor* descr = metadata()->schema()->Column(i); std::unique_ptr page_reader = contents_->GetColumnPageReader(i); @@ -77,9 +77,9 @@ std::shared_ptr RowGroupReader::Column(int i) { } std::unique_ptr RowGroupReader::GetColumnPageReader(int i) { - DCHECK(i < metadata()->num_columns()) << "The RowGroup only has " - << metadata()->num_columns() - << "columns, requested column: " << i; + DCHECK(i < metadata()->num_columns()) + << "The RowGroup only has " << metadata()->num_columns() + << "columns, requested column: " << i; return contents_->GetColumnPageReader(i); } @@ -395,9 +395,9 @@ void SerializedFile::ParseMetaDataOfEncryptedFileWithEncryptedFooter( // Handle AAD prefix EncryptionAlgorithm algo = file_crypto_metadata->encryption_algorithm(); std::string file_aad = HandleAadPrefix(file_decryption_properties, algo); - file_decryptor_.reset(new InternalFileDecryptor(file_decryption_properties, file_aad, - algo.algorithm, - file_crypto_metadata->key_metadata())); + file_decryptor_.reset(new InternalFileDecryptor( + file_decryption_properties, file_aad, algo.algorithm, + file_crypto_metadata->key_metadata(), properties_.memory_pool())); int64_t metadata_offset = file_size - kFooterSize - footer_len + crypto_metadata_len; uint32_t metadata_len = footer_len - crypto_metadata_len; std::shared_ptr metadata_buffer; @@ -423,9 +423,9 @@ void SerializedFile::ParseMetaDataOfEncryptedFileWithPlaintextFooter( EncryptionAlgorithm algo = file_metadata_->encryption_algorithm(); // Handle AAD prefix std::string file_aad = 
HandleAadPrefix(file_decryption_properties, algo); - file_decryptor_.reset( - new InternalFileDecryptor(file_decryption_properties, file_aad, algo.algorithm, - file_metadata_->footer_signing_key_metadata())); + file_decryptor_.reset(new InternalFileDecryptor( + file_decryption_properties, file_aad, algo.algorithm, + file_metadata_->footer_signing_key_metadata(), properties_.memory_pool())); if (file_decryption_properties->check_plaintext_footer_integrity()) { if (metadata_len - read_metadata_len != 28) { @@ -569,9 +569,9 @@ std::shared_ptr ParquetFileReader::metadata() const { } std::shared_ptr ParquetFileReader::RowGroup(int i) { - DCHECK(i < metadata()->num_row_groups()) << "The file only has " - << metadata()->num_row_groups() - << "row groups, requested reader for: " << i; + DCHECK(i < metadata()->num_row_groups()) + << "The file only has " << metadata()->num_row_groups() + << "row groups, requested reader for: " << i; return contents_->GetRowGroup(i); } diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index d5d7abacfc4d..f4e6426a47a0 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -429,7 +429,8 @@ class FileSerializer : public ParquetFileWriter::Contents { // Unencrypted parquet files always start with PAR1 PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4)); } else { - file_encryptor_.reset(new InternalFileEncryptor(file_encryption_properties)); + file_encryptor_.reset(new InternalFileEncryptor(file_encryption_properties, + properties_->memory_pool())); if (file_encryption_properties->encrypted_footer()) { PARQUET_THROW_NOT_OK(sink_->Write(kParquetEMagic, 4)); } else { diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index 867c81a0113f..9af59ae01e31 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -46,8 +46,13 @@ int FooterSigningEncryptor::SignedFooterEncrypt(const uint8_t* footer, int 
foote // Decryptor Decryptor::Decryptor(encryption::AesDecryptor* aes_decryptor, const std::string& key, - const std::string& file_aad, const std::string& aad) - : aes_decryptor_(aes_decryptor), key_(key), file_aad_(file_aad), aad_(aad) {} + const std::string& file_aad, const std::string& aad, + ::arrow::MemoryPool* pool) + : aes_decryptor_(aes_decryptor), + key_(key), + file_aad_(file_aad), + aad_(aad), + pool_(pool) {} int Decryptor::CiphertextSizeDelta() { return aes_decryptor_->CiphertextSizeDelta(); } @@ -62,11 +67,13 @@ int Decryptor::Decrypt(const uint8_t* ciphertext, int ciphertext_len, InternalFileDecryptor::InternalFileDecryptor(FileDecryptionProperties* properties, const std::string& file_aad, ParquetCipher::type algorithm, - const std::string& footer_key_metadata) + const std::string& footer_key_metadata, + ::arrow::MemoryPool* pool) : properties_(properties), file_aad_(file_aad), algorithm_(algorithm), - footer_key_metadata_(footer_key_metadata) { + footer_key_metadata_(footer_key_metadata), + pool_(pool) { if (properties_->is_utilized()) { throw ParquetException( "Re-using decryption properties with explicit keys for another file"); @@ -160,10 +167,10 @@ std::shared_ptr InternalFileDecryptor::GetFooterDecryptor( auto aes_metadata_decryptor = GetMetaAesDecryptor(footer_key.size()); auto aes_data_decryptor = GetDataAesDecryptor(footer_key.size()); - footer_metadata_decryptor_ = - std::make_shared(aes_metadata_decryptor, footer_key, file_aad_, aad); + footer_metadata_decryptor_ = std::make_shared( + aes_metadata_decryptor, footer_key, file_aad_, aad, pool_); footer_data_decryptor_ = - std::make_shared(aes_data_decryptor, footer_key, file_aad_, aad); + std::make_shared(aes_data_decryptor, footer_key, file_aad_, aad, pool_); if (metadata) return footer_metadata_decryptor_; return footer_data_decryptor_; @@ -219,10 +226,10 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( auto aes_metadata_decryptor = GetMetaAesDecryptor(column_key.size()); auto 
aes_data_decryptor = GetDataAesDecryptor(column_key.size()); - column_metadata_map_[column_path] = - std::make_shared(aes_metadata_decryptor, column_key, file_aad_, aad); + column_metadata_map_[column_path] = std::make_shared( + aes_metadata_decryptor, column_key, file_aad_, aad, pool_); column_data_map_[column_path] = - std::make_shared(aes_data_decryptor, column_key, file_aad_, aad); + std::make_shared(aes_data_decryptor, column_key, file_aad_, aad, pool_); if (metadata) return column_metadata_map_[column_path]; return column_data_map_[column_path]; diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h index 842eea7c6808..760337003296 100644 --- a/cpp/src/parquet/internal_file_decryptor.h +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -53,10 +53,12 @@ class FooterSigningEncryptor { class Decryptor { public: Decryptor(encryption::AesDecryptor* decryptor, const std::string& key, - const std::string& file_aad, const std::string& aad); + const std::string& file_aad, const std::string& aad, + ::arrow::MemoryPool* pool); const std::string& file_aad() const { return file_aad_; } void UpdateAad(const std::string& aad) { aad_ = aad; } + ::arrow::MemoryPool* pool() { return pool_; } int CiphertextSizeDelta(); int Decrypt(const uint8_t* ciphertext, int ciphertext_len, uint8_t* plaintext); @@ -66,6 +68,7 @@ class Decryptor { std::string key_; std::string file_aad_; std::string aad_; + ::arrow::MemoryPool* pool_; }; class InternalFileDecryptor { @@ -73,7 +76,8 @@ class InternalFileDecryptor { explicit InternalFileDecryptor(FileDecryptionProperties* properties, const std::string& file_aad, ParquetCipher::type algorithm, - const std::string& footer_key_metadata); + const std::string& footer_key_metadata, + ::arrow::MemoryPool* pool); std::string& file_aad() { return file_aad_; } @@ -120,6 +124,8 @@ class InternalFileDecryptor { std::unique_ptr meta_decryptor_[3]; std::unique_ptr data_decryptor_[3]; + ::arrow::MemoryPool* pool_; + 
std::shared_ptr GetFooterDecryptor(const std::string& aad, bool metadata); std::shared_ptr GetColumnDecryptor( std::shared_ptr column_path, diff --git a/cpp/src/parquet/internal_file_encryptor.cc b/cpp/src/parquet/internal_file_encryptor.cc index 6f097a1a0b4d..63a84557c9d7 100644 --- a/cpp/src/parquet/internal_file_encryptor.cc +++ b/cpp/src/parquet/internal_file_encryptor.cc @@ -23,8 +23,13 @@ namespace parquet { // Encryptor Encryptor::Encryptor(encryption::AesEncryptor* aes_encryptor, const std::string& key, - const std::string& file_aad, const std::string& aad) - : aes_encryptor_(aes_encryptor), key_(key), file_aad_(file_aad), aad_(aad) {} + const std::string& file_aad, const std::string& aad, + ::arrow::MemoryPool* pool) + : aes_encryptor_(aes_encryptor), + key_(key), + file_aad_(file_aad), + aad_(aad), + pool_(pool) {} int Encryptor::CiphertextSizeDelta() { return aes_encryptor_->CiphertextSizeDelta(); } @@ -35,8 +40,9 @@ int Encryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* cip } // InternalFileEncryptor -InternalFileEncryptor::InternalFileEncryptor(FileEncryptionProperties* properties) - : properties_(properties) { +InternalFileEncryptor::InternalFileEncryptor(FileEncryptionProperties* properties, + ::arrow::MemoryPool* pool) + : properties_(properties), pool_(pool) { if (properties_->is_utilized()) { throw ParquetException("Re-using encryption properties for another file"); } @@ -60,8 +66,8 @@ std::shared_ptr InternalFileEncryptor::GetFooterEncryptor() { std::string footer_aad = encryption::CreateFooterAad(properties_->file_aad()); std::string footer_key = properties_->footer_key(); auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_key.size()); - footer_encryptor_ = std::make_shared(aes_encryptor, footer_key, - properties_->file_aad(), footer_aad); + footer_encryptor_ = std::make_shared( + aes_encryptor, footer_key, properties_->file_aad(), footer_aad, pool_); return footer_encryptor_; } @@ -75,7 +81,7 @@ std::shared_ptr 
InternalFileEncryptor::GetFooterSigningEncryptor() { std::string footer_signing_key = properties_->footer_key(); auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_signing_key.size()); footer_signing_encryptor_ = std::make_shared( - aes_encryptor, footer_signing_key, properties_->file_aad(), footer_aad); + aes_encryptor, footer_signing_key, properties_->file_aad(), footer_aad, pool_); return footer_signing_encryptor_; } @@ -120,7 +126,7 @@ InternalFileEncryptor::InternalFileEncryptor::GetColumnEncryptor( std::string file_aad = properties_->file_aad(); std::shared_ptr encryptor = - std::make_shared(aes_encryptor, key, file_aad, ""); + std::make_shared(aes_encryptor, key, file_aad, "", pool_); if (metadata) column_metadata_map_[column_path] = encryptor; else diff --git a/cpp/src/parquet/internal_file_encryptor.h b/cpp/src/parquet/internal_file_encryptor.h index efef532b1639..e75a582e4a41 100644 --- a/cpp/src/parquet/internal_file_encryptor.h +++ b/cpp/src/parquet/internal_file_encryptor.h @@ -38,9 +38,11 @@ class ColumnEncryptionProperties; class Encryptor { public: Encryptor(encryption::AesEncryptor* aes_encryptor, const std::string& key, - const std::string& file_aad, const std::string& aad); + const std::string& file_aad, const std::string& aad, + ::arrow::MemoryPool* pool); const std::string& file_aad() { return file_aad_; } void UpdateAad(const std::string& aad) { aad_ = aad; } + ::arrow::MemoryPool* pool() { return pool_; } int CiphertextSizeDelta(); int Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* ciphertext); @@ -62,11 +64,12 @@ class Encryptor { std::string key_; std::string file_aad_; std::string aad_; + ::arrow::MemoryPool* pool_; }; class InternalFileEncryptor { public: - explicit InternalFileEncryptor(FileEncryptionProperties* propperties); + explicit InternalFileEncryptor(FileEncryptionProperties* propperties, ::arrow::MemoryPool* pool); std::shared_ptr GetFooterEncryptor(); std::shared_ptr GetFooterSigningEncryptor(); @@ -96,6 
+99,8 @@ class InternalFileEncryptor { std::unique_ptr meta_encryptor_[3]; std::unique_ptr data_encryptor_[3]; + ::arrow::MemoryPool* pool_; + std::shared_ptr GetColumnEncryptor( const std::shared_ptr& column_path, bool metadata); diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index b051df3480c2..3d498fc0253a 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -242,7 +242,7 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali // decrypt std::shared_ptr decrypted_buffer = std::static_pointer_cast(AllocateBuffer( - ::arrow::default_memory_pool(), + decryptor->pool(), static_cast(clen - decryptor->CiphertextSizeDelta()))); const uint8_t* cipher_buf = buf; uint32_t decrypted_buffer_len = @@ -326,7 +326,7 @@ class ThriftSerializer { const std::shared_ptr& encryptor) { std::shared_ptr cipher_buffer = std::static_pointer_cast(AllocateBuffer( - ::arrow::default_memory_pool(), + encryptor->pool(), static_cast(encryptor->CiphertextSizeDelta() + out_length))); int cipher_buffer_len = encryptor->Encrypt(out_buffer, out_length, cipher_buffer->mutable_data()); From f2ff1d742c0a3b97028a685d14ab692b5d8b7b82 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Wed, 26 Jun 2019 19:11:59 +0700 Subject: [PATCH 121/201] keep encryption parameters at method declaration (column_writer.cc/.h) --- cpp/src/parquet/column_writer.cc | 66 ++++------------------- cpp/src/parquet/column_writer.h | 9 +--- cpp/src/parquet/internal_file_encryptor.h | 3 +- 3 files changed, 14 insertions(+), 64 deletions(-) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 91918638dab9..21a937428581 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -144,13 +144,9 @@ class SerializedPageWriter : public PageWriter { SerializedPageWriter(const std::shared_ptr& sink, Compression::type codec, ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, int16_t column_chunk_ordinal, 
- MemoryPool* pool = arrow::default_memory_pool() -#ifdef PARQUET_ENCRYPTION - , + MemoryPool* pool = arrow::default_memory_pool(), std::shared_ptr meta_encryptor = NULLPTR, - std::shared_ptr data_encryptor = NULLPTR -#endif - ) + std::shared_ptr data_encryptor = NULLPTR) : sink_(sink), metadata_(metadata), pool_(pool), @@ -161,18 +157,14 @@ class SerializedPageWriter : public PageWriter { total_compressed_size_(0), page_ordinal_(0), row_group_ordinal_(row_group_ordinal), - column_ordinal_(column_chunk_ordinal) -#ifdef PARQUET_ENCRYPTION - , + column_ordinal_(column_chunk_ordinal), meta_encryptor_(meta_encryptor), - data_encryptor_(data_encryptor) -#endif - { -#ifdef PARQUET_ENCRYPTION + data_encryptor_(data_encryptor) { if (data_encryptor_ != NULLPTR || meta_encryptor_ != NULLPTR) { +#ifdef PARQUET_ENCRYPTION InitEncryption(); - } #endif + } compressor_ = GetCodecFromArrow(codec); thrift_serializer_.reset(new ThriftSerializer); } @@ -226,11 +218,9 @@ class SerializedPageWriter : public PageWriter { if (meta_encryptor_) { UpdateEncryption(encryption::kDictionaryPageHeader); } +#endif int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_); -#else - int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get()); -#endif PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len)); @@ -247,17 +237,11 @@ class SerializedPageWriter : public PageWriter { if (meta_encryptor_ != nullptr) { UpdateEncryption(encryption::kColumnMetaData); } - +#endif // index_page_offset = -1 since they are not supported metadata_->Finish(num_values_, dictionary_page_offset_, -1, data_page_offset_, total_compressed_size_, total_uncompressed_size_, has_dictionary, fallback, meta_encryptor_); -#else - // index_page_offset = -1 since they are not supported - metadata_->Finish(num_values_, dictionary_page_offset_, -1, data_page_offset_, - total_compressed_size_, total_uncompressed_size_, has_dictionary, - fallback); -#endif // 
Write metadata at end of column chunk metadata_->WriteTo(sink_.get()); } @@ -327,11 +311,9 @@ class SerializedPageWriter : public PageWriter { if (meta_encryptor_) { UpdateEncryption(encryption::kDataPageHeader); } +#endif int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_); -#else - int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get()); -#endif PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len)); total_uncompressed_size_ += uncompressed_size + header_size; @@ -428,16 +410,15 @@ class SerializedPageWriter : public PageWriter { #ifdef PARQUET_ENCRYPTION std::string data_pageAAD_; std::string data_page_headerAAD_; +#endif std::shared_ptr meta_encryptor_; std::shared_ptr data_encryptor_; -#endif }; // This implementation of the PageWriter writes to the final sink on Close . class BufferedPageWriter : public PageWriter { public: -#ifdef PARQUET_ENCRYPTION BufferedPageWriter(const std::shared_ptr& sink, Compression::type codec, ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, @@ -445,24 +426,12 @@ class BufferedPageWriter : public PageWriter { MemoryPool* pool = arrow::default_memory_pool(), std::shared_ptr meta_encryptor = NULLPTR, std::shared_ptr data_encryptor = NULLPTR) -#else - BufferedPageWriter(const std::shared_ptr& sink, - Compression::type codec, ColumnChunkMetaDataBuilder* metadata, - int16_t row_group_ordinal, int16_t current_column_ordinal, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) -#endif : final_sink_(sink), metadata_(metadata) { in_memory_sink_ = CreateOutputStream(pool); -#ifdef PARQUET_ENCRYPTION pager_ = std::unique_ptr(new SerializedPageWriter( in_memory_sink_, codec, metadata, row_group_ordinal, current_column_ordinal, pool, meta_encryptor, data_encryptor)); -#else - pager_ = std::unique_ptr( - new SerializedPageWriter(in_memory_sink_, codec, metadata, row_group_ordinal, - current_column_ordinal, pool)); -#endif } int64_t 
WriteDictionaryPage(const DictionaryPage& page) override { @@ -507,13 +476,9 @@ class BufferedPageWriter : public PageWriter { std::unique_ptr PageWriter::Open( const std::shared_ptr& sink, Compression::type codec, ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, - int16_t column_chunk_ordinal, MemoryPool* pool, bool buffered_row_group -#ifdef PARQUET_ENCRYPTION - , + int16_t column_chunk_ordinal, MemoryPool* pool, bool buffered_row_group, std::shared_ptr meta_encryptor, std::shared_ptr data_encryptor -#endif ) { -#ifdef PARQUET_ENCRYPTION if (buffered_row_group) { return std::unique_ptr(new BufferedPageWriter( sink, codec, metadata, row_group_ordinal, column_chunk_ordinal, pool, @@ -523,15 +488,6 @@ std::unique_ptr PageWriter::Open( sink, codec, metadata, row_group_ordinal, column_chunk_ordinal, pool, meta_encryptor, data_encryptor)); } -#else - if (buffered_row_group) { - return std::unique_ptr(new BufferedPageWriter( - sink, codec, metadata, row_group_ordinal, column_chunk_ordinal, pool)); - } else { - return std::unique_ptr(new SerializedPageWriter( - sink, codec, metadata, row_group_ordinal, column_chunk_ordinal, pool)); - } -#endif } // ---------------------------------------------------------------------- diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index 3c662dbe6683..d6100c4824d1 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -46,11 +46,8 @@ class ColumnDescriptor; class CompressedDataPage; class DictionaryPage; class ColumnChunkMetaDataBuilder; -class WriterProperties; - -#ifdef PARQUET_ENCRYPTION class Encryptor; -#endif +class WriterProperties; class PARQUET_EXPORT LevelEncoder { public: @@ -91,13 +88,9 @@ class PARQUET_EXPORT PageWriter { ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal = -1, int16_t column_chunk_ordinal = -1, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), -#ifdef PARQUET_ENCRYPTION bool buffered_row_group = false, 
std::shared_ptr header_encryptor = NULLPTR, std::shared_ptr data_encryptor = NULLPTR); -#else - bool buffered_row_group = false); -#endif // The Column Writer decides if dictionary encoding is used if set and // if the dictionary encoding has fallen back to default encoding on reaching dictionary diff --git a/cpp/src/parquet/internal_file_encryptor.h b/cpp/src/parquet/internal_file_encryptor.h index e75a582e4a41..7d2ce7f4f128 100644 --- a/cpp/src/parquet/internal_file_encryptor.h +++ b/cpp/src/parquet/internal_file_encryptor.h @@ -69,7 +69,8 @@ class Encryptor { class InternalFileEncryptor { public: - explicit InternalFileEncryptor(FileEncryptionProperties* propperties, ::arrow::MemoryPool* pool); + explicit InternalFileEncryptor(FileEncryptionProperties* propperties, + ::arrow::MemoryPool* pool); std::shared_ptr GetFooterEncryptor(); std::shared_ptr GetFooterSigningEncryptor(); From 7bc5635bb8e893dc7ea8683ee948fd5c364d2ab7 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Thu, 27 Jun 2019 12:14:30 +0300 Subject: [PATCH 122/201] Write to parquet stream to file in encryption test --- ...yption-reader-writer-all-crypto-options.cc | 3 +- cpp/src/parquet/CMakeLists.txt | 3 +- .../parquet/encryption-configurations-test.cc | 138 ++++++++++-------- cpp/src/parquet/encryption.h | 2 +- 4 files changed, 86 insertions(+), 60 deletions(-) diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc index 069f997d1ba5..346cd2e5b57c 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc @@ -94,6 +94,8 @@ const std::string kColumnEncryptionKey1 = "1234567890123450"; const std::string kColumnEncryptionKey2 = "1234567890123451"; const std::string fileName = "tester"; +using FileClass = ::arrow::io::FileOutputStream; + void 
PrintDecryptionConfiguration(int configuration); // Check that the decryption result is as expected. void CheckResult(std::string file, int example_id, std::string exception_msg); @@ -260,7 +262,6 @@ void InteropTestWriteEncryptedParquetFiles(std::string root_path) { std::string test_number_string = ss.str(); try { // Create a local file output stream instance. - using FileClass = ::arrow::io::FileOutputStream; std::shared_ptr out_file; std::string file = root_path + fileName + std::string(test_number_string) + ".parquet.encrypted"; diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 1a1472e3dd81..89e737e278fe 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -329,7 +329,8 @@ add_parquet_test(arrow-test if(ARROW_USE_OPENSSL) add_parquet_test(encryption-test SOURCES encryption-properties-test.cc - encryption-configurations-test.cc) + encryption-configurations-test.cc + test-util.cc) endif() # Those tests need to use static linking as they access thrift-generated diff --git a/cpp/src/parquet/encryption-configurations-test.cc b/cpp/src/parquet/encryption-configurations-test.cc index 7c79fd14c9d6..223738652d9b 100644 --- a/cpp/src/parquet/encryption-configurations-test.cc +++ b/cpp/src/parquet/encryption-configurations-test.cc @@ -17,6 +17,10 @@ #include +#include + +#include + #include "parquet/column_reader.h" #include "parquet/column_writer.h" #include "parquet/file_reader.h" @@ -69,6 +73,15 @@ */ namespace parquet { +namespace test { +std::string data_file(const char* file) { + std::string dir_string(test::get_data_dir()); + std::stringstream ss; + ss << dir_string << "/" << file; + return ss.str(); +} + +using FileClass = ::arrow::io::FileOutputStream; using parquet::ConvertedType; using parquet::Repetition; @@ -79,8 +92,6 @@ using schema::PrimitiveNode; constexpr int kFixedLength = 10; -namespace test { - const char kFooterEncryptionKey[] = "0123456789012345"; // 128bit/16 const char 
kColumnEncryptionKey1[] = "1234567890123450"; const char kColumnEncryptionKey2[] = "1234567890123451"; @@ -89,18 +100,25 @@ const char kFileName[] = "tester"; class TestEncryptionConfiguration : public ::testing::Test { public: void SetUp() { - rows_per_rowgroup_ = 50; + createDecryptionConfigurations(); // Setup the parquet schema schema_ = SetupEncryptionSchema(); - createDecryptionConfigurations(); - path_to_double_field_ = parquet::schema::ColumnPath::FromDotString("double_field"); - path_to_float_field_ = parquet::schema::ColumnPath::FromDotString("float_field"); + std::string res = "test.parquet.encrypted"; + file_name_ = data_file(res.c_str()); + } + + void TearDown() { + // delete test file. + ASSERT_EQ(std::remove(file_name_.c_str()), 0); } protected: - std::shared_ptr path_to_double_field_; - std::shared_ptr path_to_float_field_; - int rows_per_rowgroup_; + std::shared_ptr path_to_double_field_ = + parquet::schema::ColumnPath::FromDotString("double_field"); + std::shared_ptr path_to_float_field_ = + parquet::schema::ColumnPath::FromDotString("float_field"); + std::string file_name_; + int rows_per_rowgroup_ = 50; std::shared_ptr schema_; // This vector will hold various decryption configurations. 
std::vector> @@ -169,17 +187,21 @@ class TestEncryptionConfiguration : public ::testing::Test { ->build()); } - std::shared_ptr EncryptFile( - std::shared_ptr encryption_configurations) { - auto sink = CreateOutputStream(); + void EncryptFile( + std::shared_ptr encryption_configurations, + std::string file) { + std::shared_ptr out_file; WriterProperties::Builder prop_builder; - prop_builder.compression(parquet::Compression::SNAPPY); prop_builder.encryption(encryption_configurations); std::shared_ptr writer_properties = prop_builder.build(); - auto file_writer = ParquetFileWriter::Open(sink, schema_, writer_properties); + PARQUET_THROW_NOT_OK(FileClass::Open(file, &out_file)); + // Create a ParquetFileWriter instance + std::shared_ptr file_writer = + parquet::ParquetFileWriter::Open(out_file, schema_, writer_properties); + RowGroupWriter* row_group_writer; row_group_writer = file_writer->AppendRowGroup(); @@ -273,21 +295,23 @@ class TestEncryptionConfiguration : public ::testing::Test { // Close the ParquetFileWriter file_writer->Close(); - std::shared_ptr buffer; - PARQUET_THROW_NOT_OK(sink->Finish(&buffer)); - return buffer; + // Close the ParquetFileWriter + file_writer->Close(); + + // Write the bytes to file + out_file->Close().ok(); + return; } - void DecryptFile(std::shared_ptr buffer, int example_id, - int encryption_configuration) { + void DecryptFile(std::string file, int example_id, int encryption_configuration) { std::string exception_msg; try { parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); reader_properties.file_decryption_properties( vector_of_decryption_configurations_[example_id]->DeepClone()); - auto source = std::make_shared<::arrow::io::BufferReader>(buffer); - auto file_reader = ParquetFileReader::Open(source, reader_properties); + auto file_reader = + parquet::ParquetFileReader::OpenFile(file, false, reader_properties); // Get the File MetaData std::shared_ptr file_metadata = file_reader->metadata(); @@ 
-583,14 +607,14 @@ TEST_F(TestEncryptionConfiguration, UniformEncryption) { parquet::FileEncryptionProperties::Builder file_encryption_builder_1( kFooterEncryptionKey_); - std::shared_ptr buffer = - this->EncryptFile(file_encryption_builder_1.footer_key_metadata("kf")->build()); + this->EncryptFile(file_encryption_builder_1.footer_key_metadata("kf")->build(), + file_name_); // Iterate over the decryption configurations and use each one to read the encrypted // parqeut file. for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); ++example_id) { - DecryptFile(buffer, example_id, 1 /* encryption_configuration_number */); + DecryptFile(file_name_, example_id, 1 /* encryption_configuration_number */); } } @@ -617,23 +641,23 @@ TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndTheFooter) { parquet::FileEncryptionProperties::Builder file_encryption_builder_2( kFooterEncryptionKey_); - std::shared_ptr buffer = - this->EncryptFile(file_encryption_builder_2.footer_key_metadata("kf") - ->column_properties(encryption_cols2) - ->build()); + this->EncryptFile(file_encryption_builder_2.footer_key_metadata("kf") + ->column_properties(encryption_cols2) + ->build(), + file_name_); // Iterate over the decryption configurations and use each one to read the encrypted // parqeut file. for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); ++example_id) { - DecryptFile(buffer, example_id, 2 /* encryption_configuration_number */); + DecryptFile(file_name_, example_id, 2 /* encryption_configuration_number */); } } +// Encryption configuration 3: Encrypt two columns, with different keys. +// Don’t encrypt footer. +// (plaintext footer mode, readable by legacy readers) TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsWithPlaintextFooter) { - // Encryption configuration 3: Encrypt two columns, with different keys. - // Don’t encrypt footer. 
- // (plaintext footer mode, readable by legacy readers) std::map, std::shared_ptr, parquet::schema::ColumnPath::CmpColumnPath> @@ -650,17 +674,17 @@ TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsWithPlaintextFooter) { parquet::FileEncryptionProperties::Builder file_encryption_builder_3( kFooterEncryptionKey_); - std::shared_ptr buffer = - this->EncryptFile(file_encryption_builder_3.footer_key_metadata("kf") - ->column_properties(encryption_cols3) - ->set_plaintext_footer() - ->build()); + this->EncryptFile(file_encryption_builder_3.footer_key_metadata("kf") + ->column_properties(encryption_cols3) + ->set_plaintext_footer() + ->build(), + file_name_); // Iterate over the decryption configurations and use each one to read the encrypted // parqeut file. for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); ++example_id) { - DecryptFile(buffer, example_id, 3 /* encryption_configuration_number */); + DecryptFile(file_name_, example_id, 3 /* encryption_configuration_number */); } } @@ -683,16 +707,16 @@ TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndFooterWithAadPrefix) { parquet::FileEncryptionProperties::Builder file_encryption_builder_4( kFooterEncryptionKey_); - std::shared_ptr buffer = - this->EncryptFile(file_encryption_builder_4.footer_key_metadata("kf") - ->column_properties(encryption_cols4) - ->aad_prefix(kFileName_) - ->build()); + this->EncryptFile(file_encryption_builder_4.footer_key_metadata("kf") + ->column_properties(encryption_cols4) + ->aad_prefix(kFileName_) + ->build(), + file_name_); // Iterate over the decryption configurations and use each one to read the encrypted // parqeut file. 
for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); ++example_id) { - DecryptFile(buffer, example_id, 4 /* encryption_configuration_number */); + DecryptFile(file_name_, example_id, 4 /* encryption_configuration_number */); } } @@ -716,17 +740,17 @@ TEST_F(TestEncryptionConfiguration, parquet::FileEncryptionProperties::Builder file_encryption_builder_5( kFooterEncryptionKey_); - std::shared_ptr buffer = - this->EncryptFile(file_encryption_builder_5.column_properties(encryption_cols5) - ->footer_key_metadata("kf") - ->aad_prefix(kFileName_) - ->disable_store_aad_prefix_storage() - ->build()); + this->EncryptFile(file_encryption_builder_5.column_properties(encryption_cols5) + ->footer_key_metadata("kf") + ->aad_prefix(kFileName_) + ->disable_store_aad_prefix_storage() + ->build(), + file_name_); // Iterate over the decryption configurations and use each one to read the encrypted // parqeut file. for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); ++example_id) { - DecryptFile(buffer, example_id, 5 /* encryption_configuration_number */); + DecryptFile(file_name_, example_id, 5 /* encryption_configuration_number */); } } @@ -749,17 +773,17 @@ TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndFooterUseAES_GCM_CTR) { parquet::FileEncryptionProperties::Builder file_encryption_builder_6( kFooterEncryptionKey_); - std::shared_ptr buffer = - this->EncryptFile(file_encryption_builder_6.footer_key_metadata("kf") - ->column_properties(encryption_cols6) - ->algorithm(parquet::ParquetCipher::AES_GCM_CTR_V1) - ->build()); + this->EncryptFile(file_encryption_builder_6.footer_key_metadata("kf") + ->column_properties(encryption_cols6) + ->algorithm(parquet::ParquetCipher::AES_GCM_CTR_V1) + ->build(), + file_name_); // Iterate over the decryption configurations and use each one to read the encrypted // parqeut file. 
for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); ++example_id) { - DecryptFile(buffer, example_id, 6 /* encryption_configuration_number */); + DecryptFile(file_name_, example_id, 6 /* encryption_configuration_number */); } } diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index f4a66ce81fee..4958faac2aec 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -89,7 +89,7 @@ class PARQUET_EXPORT KeyAccessDeniedException : public ParquetException { }; inline uint8_t* str2bytes(const std::string& str) { - if (str.empty()) return nullptr; + if (str.empty()) return NULLPTR; char* cbytes = const_cast(str.c_str()); return reinterpret_cast(cbytes); From 5f7503c7ddaeb7137aff4544666ecc4d3f0e82ae Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Mon, 1 Jul 2019 12:48:55 +0300 Subject: [PATCH 123/201] Add file reader and file writer Close to encryption-configurations-test.cc --- .../encryption-reader-writer-all-crypto-options.cc | 3 +-- cpp/src/parquet/CMakeLists.txt | 4 +++- cpp/src/parquet/encryption-configurations-test.cc | 9 ++------- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc index 346cd2e5b57c..06d43be8f5a0 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc @@ -369,8 +369,7 @@ void InteropTestReadEncryptedParquetFiles(std::string root_path) { vector_of_decryption_configurations.push_back( file_decryption_builder_2.key_retriever(kr2)->aad_prefix(fileName)->build()); - // Decryption configuration 3: Decrypt using explicit column and footer keys. Supply - // aad_prefix. + // Decryption configuration 3: Decrypt using explicit column and footer keys. 
std::shared_ptr path_float_ptr = parquet::schema::ColumnPath::FromDotString("float_field"); std::shared_ptr path_double_ptr = diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 89e737e278fe..a151a52b6276 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -328,7 +328,9 @@ add_parquet_test(arrow-test test_util.cc) if(ARROW_USE_OPENSSL) - add_parquet_test(encryption-test SOURCES encryption-properties-test.cc + add_parquet_test(encryption-test + SOURCES + encryption-properties-test.cc encryption-configurations-test.cc test-util.cc) endif() diff --git a/cpp/src/parquet/encryption-configurations-test.cc b/cpp/src/parquet/encryption-configurations-test.cc index 223738652d9b..bcdc7eff4466 100644 --- a/cpp/src/parquet/encryption-configurations-test.cc +++ b/cpp/src/parquet/encryption-configurations-test.cc @@ -161,8 +161,7 @@ class TestEncryptionConfiguration : public ::testing::Test { vector_of_decryption_configurations_.push_back( file_decryption_builder_2.key_retriever(kr2)->aad_prefix(kFileName_)->build()); - // Decryption configuration 3: Decrypt using explicit column and footer keys. Supply - // aad_prefix. + // Decryption configuration 3: Decrypt using explicit column and footer keys. 
std::shared_ptr path_float_ptr = parquet::schema::ColumnPath::FromDotString("float_field"); std::shared_ptr path_double_ptr = @@ -295,11 +294,6 @@ class TestEncryptionConfiguration : public ::testing::Test { // Close the ParquetFileWriter file_writer->Close(); - // Close the ParquetFileWriter - file_writer->Close(); - - // Write the bytes to file - out_file->Close().ok(); return; } @@ -524,6 +518,7 @@ class TestEncryptionConfiguration : public ::testing::Test { ASSERT_EQ(memcmp(value.ptr, &expected_value[0], kFixedLength), 0); i++; } + file_reader->Close(); } } catch (const std::exception& e) { exception_msg = e.what(); From 79372c65f7e5be73ba9600879398f185074aad11 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Thu, 4 Jul 2019 07:28:40 +0300 Subject: [PATCH 124/201] Change encryption-configuration-test --- cpp/src/parquet/CMakeLists.txt | 3 +- ...figuration-encrypted-columns-and-footer.cc | 556 ++++++++++++++++++ ...ion-encrypted-columns-plaintext-footer.cc} | 255 +------- 3 files changed, 570 insertions(+), 244 deletions(-) create mode 100644 cpp/src/parquet/encryption-configuration-encrypted-columns-and-footer.cc rename cpp/src/parquet/{encryption-configurations-test.cc => encryption-configuration-encrypted-columns-plaintext-footer.cc} (63%) diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index a151a52b6276..ffd18b062d80 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -330,8 +330,9 @@ add_parquet_test(arrow-test if(ARROW_USE_OPENSSL) add_parquet_test(encryption-test SOURCES + encryption-configuration-encrypted-columns-and-footer.cc + encryption-configuration-encrypted-columns-plaintext-footer.cc encryption-properties-test.cc - encryption-configurations-test.cc test-util.cc) endif() diff --git a/cpp/src/parquet/encryption-configuration-encrypted-columns-and-footer.cc b/cpp/src/parquet/encryption-configuration-encrypted-columns-and-footer.cc new file mode 100644 index 000000000000..fd0e98ad82c9 
--- /dev/null +++ b/cpp/src/parquet/encryption-configuration-encrypted-columns-and-footer.cc @@ -0,0 +1,556 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include + +#include + +#include "parquet/column_reader.h" +#include "parquet/column_writer.h" +#include "parquet/file_reader.h" +#include "parquet/file_writer.h" +#include "parquet/platform.h" +#include "parquet/test-util.h" + +/* + * This file contains unit-test for writing and reading encrypted Parquet file with + * different encryption and decryption configuration. + * + * A detailed description of the Parquet Modular Encryption specification can be found + * here: + * https://github.com/apache/parquet-format/blob/encryption/Encryption.md + * + * The unit-test creates a single parquet file with eight columns using the + * following encryption configuration: + * + * - Encryption configuration : Encrypt two columns and the footer, with different + * keys. + * + * The written parquet file produced above is read by the following decryption + * configurations: + * + * - Decryption configuration : Decrypt using key retriever that holds the keys of + * two encrypted columns and the footer key. 
+ */ + +namespace parquet { +namespace test { + +using FileClass = ::arrow::io::FileOutputStream; + +using parquet::ConvertedType; +using parquet::Repetition; +using parquet::Type; +using schema::GroupNode; +using schema::NodePtr; +using schema::PrimitiveNode; + +constexpr int kFixedLength = 10; + +const char kFooterEncryptionKey[] = "0123456789012345"; // 128bit/16 +const char kColumnEncryptionKey1[] = "1234567890123450"; +const char kColumnEncryptionKey2[] = "1234567890123451"; +const char kFileName[] = "tester"; + +class TestEncryptionConfiguration : public ::testing::Test { + public: + void SetUp() { + createDecryptionConfigurations(); + // Setup the parquet schema + schema_ = SetupEncryptionSchema(); + std::string res = "test.parquet.encrypted"; + file_name_ = data_file(res.c_str()); + } + + void TearDown() { + // delete test file. + ASSERT_EQ(std::remove(file_name_.c_str()), 0); + } + + protected: + std::shared_ptr path_to_double_field_ = + parquet::schema::ColumnPath::FromDotString("double_field"); + std::shared_ptr path_to_float_field_ = + parquet::schema::ColumnPath::FromDotString("float_field"); + std::string file_name_; + int rows_per_rowgroup_ = 50; + std::shared_ptr schema_; + // This vector will hold various decryption configurations. 
+ std::vector> + vector_of_decryption_configurations_; + std::string kFooterEncryptionKey_ = std::string(kFooterEncryptionKey); + std::string kColumnEncryptionKey1_ = std::string(kColumnEncryptionKey1); + std::string kColumnEncryptionKey2_ = std::string(kColumnEncryptionKey2); + std::string kFileName_ = std::string(kFileName); + + std::string data_file(const char* file) { + std::string dir_string(test::get_data_dir()); + std::stringstream ss; + ss << dir_string << "/" << file; + return ss.str(); + } + + void createDecryptionConfigurations() { + /********************************************************************************** + Creating Decryption configuration + **********************************************************************************/ + + // Decryption configuration : Decrypt using key retriever callback that holds the + // keys of two encrypted columns and the footer key. + std::shared_ptr string_kr1 = + std::make_shared(); + string_kr1->PutKey("kf", kFooterEncryptionKey_); + string_kr1->PutKey("kc1", kColumnEncryptionKey1_); + string_kr1->PutKey("kc2", kColumnEncryptionKey2_); + std::shared_ptr kr1 = + std::static_pointer_cast(string_kr1); + + parquet::FileDecryptionProperties::Builder file_decryption_builder_1; + vector_of_decryption_configurations_.push_back( + file_decryption_builder_1.key_retriever(kr1)->build()); + } + + void EncryptFile( + std::shared_ptr encryption_configurations, + std::string file) { + std::shared_ptr out_file; + + WriterProperties::Builder prop_builder; + prop_builder.compression(parquet::Compression::SNAPPY); + prop_builder.encryption(encryption_configurations); + std::shared_ptr writer_properties = prop_builder.build(); + + PARQUET_THROW_NOT_OK(FileClass::Open(file, &out_file)); + // Create a ParquetFileWriter instance + std::shared_ptr file_writer = + parquet::ParquetFileWriter::Open(out_file, schema_, writer_properties); + + RowGroupWriter* row_group_writer; + row_group_writer = file_writer->AppendRowGroup(); + + // 
Write the Bool column + parquet::BoolWriter* bool_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + bool value = ((i % 2) == 0) ? true : false; + bool_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Int32 column + parquet::Int32Writer* int32_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + int32_t value = i; + int32_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Int64 column. Each row has repeats twice. + parquet::Int64Writer* int64_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < 2 * rows_per_rowgroup_; i++) { + int64_t value = i * 1000 * 1000; + value *= 1000 * 1000; + int16_t definition_level = 1; + int16_t repetition_level = 0; + if ((i % 2) == 0) { + repetition_level = 1; // start of a new record + } + int64_writer->WriteBatch(1, &definition_level, &repetition_level, &value); + } + + // Write the INT96 column. + parquet::Int96Writer* int96_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + parquet::Int96 value; + value.value[0] = i; + value.value[1] = i + 1; + value.value[2] = i + 2; + int96_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Float column + parquet::FloatWriter* float_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + float value = static_cast(i) * 1.1f; + float_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Double column + parquet::DoubleWriter* double_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + double value = i * 1.1111111; + double_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the ByteArray column. 
Make every alternate values NULL + parquet::ByteArrayWriter* ba_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + parquet::ByteArray value; + char hello[kFixedLength] = "parquet"; + hello[7] = static_cast(static_cast('0') + i / 100); + hello[8] = static_cast(static_cast('0') + (i / 10) % 10); + hello[9] = static_cast(static_cast('0') + i % 10); + if (i % 2 == 0) { + int16_t definition_level = 1; + value.ptr = reinterpret_cast(&hello[0]); + value.len = kFixedLength; + ba_writer->WriteBatch(1, &definition_level, nullptr, &value); + } else { + int16_t definition_level = 0; + ba_writer->WriteBatch(1, &definition_level, nullptr, nullptr); + } + } + // Write the FixedLengthByteArray column + parquet::FixedLenByteArrayWriter* flba_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + parquet::FixedLenByteArray value; + char v = static_cast(i); + char flba[kFixedLength] = {v, v, v, v, v, v, v, v, v, v}; + value.ptr = reinterpret_cast(&flba[0]); + + flba_writer->WriteBatch(1, nullptr, nullptr, &value); + } + // Close the ParquetFileWriter + file_writer->Close(); + + return; + } + + void DecryptFile(std::string file, int example_id, int encryption_configuration) { + std::string exception_msg; + try { + parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); + reader_properties.file_decryption_properties( + vector_of_decryption_configurations_[example_id]->DeepClone()); + + auto file_reader = + parquet::ParquetFileReader::OpenFile(file, false, reader_properties); + + // Get the File MetaData + std::shared_ptr file_metadata = file_reader->metadata(); + + // Get the number of RowGroups + int num_row_groups = file_metadata->num_row_groups(); + ASSERT_EQ(num_row_groups, 1); + + // Get the number of Columns + int num_columns = file_metadata->num_columns(); + ASSERT_EQ(num_columns, 8); + + // Iterate over all the RowGroups in the file + for (int 
r = 0; r < num_row_groups; ++r) { + // Get the RowGroup Reader + std::shared_ptr row_group_reader = + file_reader->RowGroup(r); + + int64_t values_read = 0; + int64_t rows_read = 0; + int16_t definition_level; + int16_t repetition_level; + int i; + std::shared_ptr column_reader; + + // Get the Column Reader for the boolean column + column_reader = row_group_reader->Column(0); + parquet::BoolReader* bool_reader = + static_cast(column_reader.get()); + + // Read all the rows in the column + i = 0; + while (bool_reader->HasNext()) { + bool value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + bool expected_value = ((i % 2) == 0) ? true : false; + ASSERT_EQ(value, expected_value); + i++; + } + // Get the Column Reader for the Int32 column + column_reader = row_group_reader->Column(1); + parquet::Int32Reader* int32_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int32_reader->HasNext()) { + int32_t value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + ASSERT_EQ(value, i); + i++; + } + // Get the Column Reader for the Int64 column + column_reader = row_group_reader->Column(2); + parquet::Int64Reader* int64_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int64_reader->HasNext()) { + int64_t value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = int64_reader->ReadBatch(1, &definition_level, &repetition_level, + &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + int64_t expected_value = i * 1000 * 1000; + expected_value *= 1000 * 1000; + ASSERT_EQ(value, expected_value); + if ((i % 2) == 0) { + ASSERT_EQ(repetition_level, 1); + } else { + ASSERT_EQ(repetition_level, 0); + } + i++; + } + + // Get the Column Reader for the Int96 column + column_reader = row_group_reader->Column(3); + parquet::Int96Reader* int96_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int96_reader->HasNext()) { + parquet::Int96 value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = int96_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + parquet::Int96 expected_value; + expected_value.value[0] = i; + expected_value.value[1] = i + 1; + expected_value.value[2] = i + 2; + for (int j = 0; j < 3; j++) { + ASSERT_EQ(value.value[j], expected_value.value[j]); + } + i++; + } + + // Get the Column Reader for the Float column + column_reader = row_group_reader->Column(4); + parquet::FloatReader* float_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (float_reader->HasNext()) { + float value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + float expected_value = static_cast(i) * 1.1f; + ASSERT_EQ(value, expected_value); + i++; + } + // Get the Column Reader for the Double column + column_reader = row_group_reader->Column(5); + parquet::DoubleReader* double_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (double_reader->HasNext()) { + double value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + double expected_value = i * 1.1111111; + ASSERT_EQ(value, expected_value); + i++; + } + // Get the Column Reader for the ByteArray column + column_reader = row_group_reader->Column(6); + parquet::ByteArrayReader* ba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (ba_reader->HasNext()) { + parquet::ByteArray value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = + ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // Verify the value written + char expected_value[kFixedLength] = "parquet"; + expected_value[7] = static_cast('0' + i / 100); + expected_value[8] = static_cast('0' + (i / 10) % 10); + expected_value[9] = static_cast('0' + i % 10); + if (i % 2 == 0) { // only alternate values exist + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + ASSERT_EQ(value.len, kFixedLength); + ASSERT_EQ(memcmp(value.ptr, &expected_value[0], kFixedLength), 0); + ASSERT_EQ(definition_level, 1); + } else { + // There are NULL values in the rows written + ASSERT_EQ(values_read, 0); + ASSERT_EQ(definition_level, 0); + } + i++; + } + // Get the Column Reader for the FixedLengthByteArray column + column_reader = row_group_reader->Column(7); + parquet::FixedLenByteArrayReader* flba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (flba_reader->HasNext()) { + parquet::FixedLenByteArray value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = flba_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + char v = static_cast(i); + char expected_value[kFixedLength] = {v, v, v, v, v, v, v, v, v, v}; + ASSERT_EQ(memcmp(value.ptr, &expected_value[0], kFixedLength), 0); + i++; + } + file_reader->Close(); + } + } catch (const std::exception& e) { + exception_msg = e.what(); + } + CheckResult(encryption_configuration, example_id, exception_msg); + } + + // Check that the decryption result is as expected. + void CheckResult(int encryption_configuration_number, int example_id, + std::string exception_msg) { + if (!exception_msg.empty()) { + ASSERT_EQ(1, 0); + } + } + + std::shared_ptr SetupEncryptionSchema() { + parquet::schema::NodeVector fields; + // Create a primitive node named 'boolean_field' with type:BOOLEAN, + // repetition:REQUIRED + fields.push_back(PrimitiveNode::Make("boolean_field", Repetition::REQUIRED, + Type::BOOLEAN, ConvertedType::NONE)); + + // Create a primitive node named 'int32_field' with type:INT32, repetition:REQUIRED, + // logical type:TIME_MILLIS + fields.push_back(PrimitiveNode::Make("int32_field", Repetition::REQUIRED, Type::INT32, + ConvertedType::TIME_MILLIS)); + + // Create a primitive node named 'int64_field' with type:INT64, repetition:REPEATED + fields.push_back(PrimitiveNode::Make("int64_field", Repetition::REPEATED, Type::INT64, + ConvertedType::NONE)); + + fields.push_back(PrimitiveNode::Make("int96_field", Repetition::REQUIRED, Type::INT96, + ConvertedType::NONE)); + + fields.push_back(PrimitiveNode::Make("float_field", Repetition::REQUIRED, Type::FLOAT, + ConvertedType::NONE)); + + fields.push_back(PrimitiveNode::Make("double_field", Repetition::REQUIRED, + Type::DOUBLE, ConvertedType::NONE)); + + // Create a primitive node named 
'ba_field' with type:BYTE_ARRAY, repetition:OPTIONAL + fields.push_back(PrimitiveNode::Make("ba_field", Repetition::OPTIONAL, + Type::BYTE_ARRAY, ConvertedType::NONE)); + + // Create a primitive node named 'flba_field' with type:FIXED_LEN_BYTE_ARRAY, + // repetition:REQUIRED, field_length = kFixedLength + fields.push_back(PrimitiveNode::Make("flba_field", Repetition::REQUIRED, + Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, + kFixedLength)); + + // Create a GroupNode named 'schema' using the primitive nodes defined above + // This GroupNode is the root node of the schema tree + return std::static_pointer_cast( + GroupNode::Make("schema", Repetition::REQUIRED, fields)); + } +}; + +// Encryption configuration : Encrypt two columns and the footer, with different keys. +TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndTheFooter) { + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols2; + std::shared_ptr path_to_double_field_ = + parquet::schema::ColumnPath::FromDotString("double_field"); + std::shared_ptr path_to_float_field_ = + parquet::schema::ColumnPath::FromDotString("float_field"); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_20( + path_to_double_field_); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_21( + path_to_float_field_); + encryption_col_builder_20.key(kColumnEncryptionKey1_)->key_id("kc1"); + encryption_col_builder_21.key(kColumnEncryptionKey2_)->key_id("kc2"); + + encryption_cols2[path_to_double_field_] = encryption_col_builder_20.build(); + encryption_cols2[path_to_float_field_] = encryption_col_builder_21.build(); + + parquet::FileEncryptionProperties::Builder file_encryption_builder_2( + kFooterEncryptionKey_); + + this->EncryptFile(file_encryption_builder_2.footer_key_metadata("kf") + ->column_properties(encryption_cols2) + ->build(), + file_name_); + + // Iterate over the decryption configurations and use each one to read the encrypted + // parqeut 
file. + for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); + ++example_id) { + DecryptFile(file_name_, example_id, 2 /* encryption_configuration_number */); + } +} + +} // namespace test +} // namespace parquet diff --git a/cpp/src/parquet/encryption-configurations-test.cc b/cpp/src/parquet/encryption-configuration-encrypted-columns-plaintext-footer.cc similarity index 63% rename from cpp/src/parquet/encryption-configurations-test.cc rename to cpp/src/parquet/encryption-configuration-encrypted-columns-plaintext-footer.cc index bcdc7eff4466..5b10a013e7b7 100644 --- a/cpp/src/parquet/encryption-configurations-test.cc +++ b/cpp/src/parquet/encryption-configuration-encrypted-columns-plaintext-footer.cc @@ -29,47 +29,25 @@ #include "parquet/test-util.h" /* - * This file contains unit-tests for writing and reading encrypted Parquet files with - * different encryption and decryption configurations. - * - * Each unit-test produces a single parquet file, encrypted with one of the encryption - * configuration described below; and is read multiple times using a set of decryption - * configurations, also described below. + * This file contains unit-test for writing and reading encrypted Parquet file with + * different encryption and decryption configuration. * * A detailed description of the Parquet Modular Encryption specification can be found * here: * https://github.com/apache/parquet-format/blob/encryption/Encryption.md * - * Each unit-test creates a single parquet file with eight columns using one of the - * following encryption configurations: + * The unit-test creates a single parquet file with eight columns using the + * following encryption configuration: * - * - Encryption configuration 1: Encrypt all columns and the footer with the same key. - * (uniform encryption) - * - Encryption configuration 2: Encrypt two columns and the footer, with different - * keys. 
- * - Encryption configuration 3: Encrypt two columns, with different keys. - * Don’t encrypt footer (to enable legacy readers) - * - plaintext footer mode. - * - Encryption configuration 4: Encrypt two columns and the footer, with different - * keys. Supply aad_prefix for file identity - * verification. - * - Encryption configuration 5: Encrypt two columns and the footer, with different - * keys. Supply aad_prefix, and call - * disable_aad_prefix_storage to prevent file - * identity storage in file metadata. - * - Encryption configuration 6: Encrypt two columns and the footer, with different - * keys. Use the alternative (AES_GCM_CTR_V1) algorithm. + * - Encryption configuration : Encrypt two columns, with different keys. + * Don’t encrypt footer (to enable legacy readers) + * - plaintext footer mode. * * The written parquet file produced above is read by each of the following decryption * configurations: * - * - Decryption configuration 1: Decrypt using key retriever that holds the keys of - * two encrypted columns and the footer key. - * - Decryption configuration 2: Decrypt using key retriever that holds the keys of - * two encrypted columns and the footer key. Supplies - * aad_prefix to verify file identity. - * - Decryption configuration 3: Decrypt using explicit column and footer keys - * (instead of key retrieval callback). + * - Decryption configuration : Decrypt using key retriever that holds the keys of + * two encrypted columns and the footer key. 
*/ namespace parquet { @@ -130,10 +108,10 @@ class TestEncryptionConfiguration : public ::testing::Test { void createDecryptionConfigurations() { /********************************************************************************** - Creating a number of Decryption configurations + Creating Decryption configuration **********************************************************************************/ - // Decryption configuration 1: Decrypt using key retriever callback that holds the + // Decryption configuration: Decrypt using key retriever callback that holds the // keys of two encrypted columns and the footer key. std::shared_ptr string_kr1 = std::make_shared(); @@ -147,43 +125,6 @@ class TestEncryptionConfiguration : public ::testing::Test { vector_of_decryption_configurations_.push_back( file_decryption_builder_1.key_retriever(kr1)->build()); - // Decryption configuration 2: Decrypt using key retriever callback that holds the - // keys of two encrypted columns and the footer key. Supply aad_prefix. - std::shared_ptr string_kr2 = - std::make_shared(); - string_kr2->PutKey("kf", kFooterEncryptionKey_); - string_kr2->PutKey("kc1", kColumnEncryptionKey1_); - string_kr2->PutKey("kc2", kColumnEncryptionKey2_); - std::shared_ptr kr2 = - std::static_pointer_cast(string_kr2); - - parquet::FileDecryptionProperties::Builder file_decryption_builder_2; - vector_of_decryption_configurations_.push_back( - file_decryption_builder_2.key_retriever(kr2)->aad_prefix(kFileName_)->build()); - - // Decryption configuration 3: Decrypt using explicit column and footer keys. 
- std::shared_ptr path_float_ptr = - parquet::schema::ColumnPath::FromDotString("float_field"); - std::shared_ptr path_double_ptr = - parquet::schema::ColumnPath::FromDotString("double_field"); - std::map, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath> - decryption_cols; - parquet::ColumnDecryptionProperties::Builder decryption_col_builder31( - path_double_ptr); - parquet::ColumnDecryptionProperties::Builder decryption_col_builder32(path_float_ptr); - - decryption_cols[path_double_ptr] = - decryption_col_builder31.key(kColumnEncryptionKey1_)->build(); - decryption_cols[path_float_ptr] = - decryption_col_builder32.key(kColumnEncryptionKey2_)->build(); - - parquet::FileDecryptionProperties::Builder file_decryption_builder_3; - vector_of_decryption_configurations_.push_back( - file_decryption_builder_3.footer_key(kFooterEncryptionKey_) - ->column_properties(decryption_cols) - ->build()); } void EncryptFile( @@ -529,26 +470,6 @@ class TestEncryptionConfiguration : public ::testing::Test { // Check that the decryption result is as expected. void CheckResult(int encryption_configuration_number, int example_id, std::string exception_msg) { - int decryption_configuration_number = example_id + 1; - // Encryption_configuration number five contains aad_prefix and - // disable_aad_prefix_storage. - // An exception is expected to be thrown if the file is not decrypted with aad_prefix. - if (encryption_configuration_number == 5) { - if (decryption_configuration_number == 1 || decryption_configuration_number == 3) { - std::size_t found = exception_msg.find("AAD"); - ASSERT_FALSE(found == std::string::npos); - return; - } - } - // Decryption configuration number two contains aad_prefix. An exception is expected - // to be thrown if the file was not encrypted with the same aad_prefix. 
- if (decryption_configuration_number == 2) { - if (encryption_configuration_number != 5 && encryption_configuration_number != 4) { - std::size_t found = exception_msg.find("AAD"); - ASSERT_FALSE(found == std::string::npos); - return; - } - } if (!exception_msg.empty()) { ASSERT_EQ(1, 0); } @@ -596,60 +517,7 @@ class TestEncryptionConfiguration : public ::testing::Test { } }; -// Encryption configuration 1: Encrypt all columns and the footer with the same key. -// (uniform encryption) -TEST_F(TestEncryptionConfiguration, UniformEncryption) { - parquet::FileEncryptionProperties::Builder file_encryption_builder_1( - kFooterEncryptionKey_); - - this->EncryptFile(file_encryption_builder_1.footer_key_metadata("kf")->build(), - file_name_); - - // Iterate over the decryption configurations and use each one to read the encrypted - // parqeut file. - for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); - ++example_id) { - DecryptFile(file_name_, example_id, 1 /* encryption_configuration_number */); - } -} - -// Encryption configuration 2: Encrypt two columns and the footer, with different keys. 
-TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndTheFooter) { - std::map, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath> - encryption_cols2; - std::shared_ptr path_to_double_field_ = - parquet::schema::ColumnPath::FromDotString("double_field"); - std::shared_ptr path_to_float_field_ = - parquet::schema::ColumnPath::FromDotString("float_field"); - parquet::ColumnEncryptionProperties::Builder encryption_col_builder_20( - path_to_double_field_); - parquet::ColumnEncryptionProperties::Builder encryption_col_builder_21( - path_to_float_field_); - encryption_col_builder_20.key(kColumnEncryptionKey1_)->key_id("kc1"); - encryption_col_builder_21.key(kColumnEncryptionKey2_)->key_id("kc2"); - - encryption_cols2[path_to_double_field_] = encryption_col_builder_20.build(); - encryption_cols2[path_to_float_field_] = encryption_col_builder_21.build(); - - parquet::FileEncryptionProperties::Builder file_encryption_builder_2( - kFooterEncryptionKey_); - - this->EncryptFile(file_encryption_builder_2.footer_key_metadata("kf") - ->column_properties(encryption_cols2) - ->build(), - file_name_); - - // Iterate over the decryption configurations and use each one to read the encrypted - // parqeut file. - for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); - ++example_id) { - DecryptFile(file_name_, example_id, 2 /* encryption_configuration_number */); - } -} - -// Encryption configuration 3: Encrypt two columns, with different keys. +// Encryption configuration: Encrypt two columns, with different keys. // Don’t encrypt footer. // (plaintext footer mode, readable by legacy readers) TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsWithPlaintextFooter) { @@ -683,104 +551,5 @@ TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsWithPlaintextFooter) { } } -// Encryption configuration 4: Encrypt two columns and the footer, with different keys. -// Use aad_prefix. 
-TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndFooterWithAadPrefix) { - std::map, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath> - encryption_cols4; - parquet::ColumnEncryptionProperties::Builder encryption_col_builder_40( - path_to_double_field_); - parquet::ColumnEncryptionProperties::Builder encryption_col_builder_41( - path_to_float_field_); - encryption_col_builder_40.key(kColumnEncryptionKey1_)->key_id("kc1"); - encryption_col_builder_41.key(kColumnEncryptionKey2_)->key_id("kc2"); - - encryption_cols4[path_to_double_field_] = encryption_col_builder_40.build(); - encryption_cols4[path_to_float_field_] = encryption_col_builder_41.build(); - parquet::FileEncryptionProperties::Builder file_encryption_builder_4( - kFooterEncryptionKey_); - - this->EncryptFile(file_encryption_builder_4.footer_key_metadata("kf") - ->column_properties(encryption_cols4) - ->aad_prefix(kFileName_) - ->build(), - file_name_); - // Iterate over the decryption configurations and use each one to read the encrypted - // parqeut file. - for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); - ++example_id) { - DecryptFile(file_name_, example_id, 4 /* encryption_configuration_number */); - } -} - -// Encryption configuration 5: Encrypt two columns and the footer, with different keys. -// Use aad_prefix and disable_aad_prefix_storage. 
-TEST_F(TestEncryptionConfiguration, - EncryptTwoColumnsAndFooterWithAadPrefixDisable_aad_prefix_storage) { - std::map, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath> - encryption_cols5; - parquet::ColumnEncryptionProperties::Builder encryption_col_builder_50( - path_to_double_field_); - parquet::ColumnEncryptionProperties::Builder encryption_col_builder_51( - path_to_float_field_); - encryption_col_builder_50.key(kColumnEncryptionKey1_)->key_id("kc1"); - encryption_col_builder_51.key(kColumnEncryptionKey2_)->key_id("kc2"); - - encryption_cols5[path_to_double_field_] = encryption_col_builder_50.build(); - encryption_cols5[path_to_float_field_] = encryption_col_builder_51.build(); - parquet::FileEncryptionProperties::Builder file_encryption_builder_5( - kFooterEncryptionKey_); - - this->EncryptFile(file_encryption_builder_5.column_properties(encryption_cols5) - ->footer_key_metadata("kf") - ->aad_prefix(kFileName_) - ->disable_store_aad_prefix_storage() - ->build(), - file_name_); - // Iterate over the decryption configurations and use each one to read the encrypted - // parqeut file. - for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); - ++example_id) { - DecryptFile(file_name_, example_id, 5 /* encryption_configuration_number */); - } -} - -// Encryption configuration 6: Encrypt two columns and the footer, with different keys. -// Use AES_GCM_CTR_V1 algorithm. 
-TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndFooterUseAES_GCM_CTR) { - std::map, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath> - encryption_cols6; - parquet::ColumnEncryptionProperties::Builder encryption_col_builder_60( - path_to_double_field_); - parquet::ColumnEncryptionProperties::Builder encryption_col_builder_61( - path_to_float_field_); - encryption_col_builder_60.key(kColumnEncryptionKey1_)->key_id("kc1"); - encryption_col_builder_61.key(kColumnEncryptionKey2_)->key_id("kc2"); - - encryption_cols6[path_to_double_field_] = encryption_col_builder_60.build(); - encryption_cols6[path_to_float_field_] = encryption_col_builder_61.build(); - parquet::FileEncryptionProperties::Builder file_encryption_builder_6( - kFooterEncryptionKey_); - - this->EncryptFile(file_encryption_builder_6.footer_key_metadata("kf") - ->column_properties(encryption_cols6) - ->algorithm(parquet::ParquetCipher::AES_GCM_CTR_V1) - ->build(), - file_name_); - - // Iterate over the decryption configurations and use each one to read the encrypted - // parqeut file. 
- for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); - ++example_id) { - DecryptFile(file_name_, example_id, 6 /* encryption_configuration_number */); - } -} - } // namespace test } // namespace parquet From fe773e17773a148e86cb425b15c892c4a6a2f902 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Thu, 4 Jul 2019 08:31:13 +0300 Subject: [PATCH 125/201] Delete encryption-configuration-encrypted-columns-plaintext-footer.cc test --- cpp/src/parquet/CMakeLists.txt | 1 - ...tion-encrypted-columns-plaintext-footer.cc | 555 ------------------ 2 files changed, 556 deletions(-) delete mode 100644 cpp/src/parquet/encryption-configuration-encrypted-columns-plaintext-footer.cc diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index ffd18b062d80..146e82c57627 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -331,7 +331,6 @@ if(ARROW_USE_OPENSSL) add_parquet_test(encryption-test SOURCES encryption-configuration-encrypted-columns-and-footer.cc - encryption-configuration-encrypted-columns-plaintext-footer.cc encryption-properties-test.cc test-util.cc) endif() diff --git a/cpp/src/parquet/encryption-configuration-encrypted-columns-plaintext-footer.cc b/cpp/src/parquet/encryption-configuration-encrypted-columns-plaintext-footer.cc deleted file mode 100644 index 5b10a013e7b7..000000000000 --- a/cpp/src/parquet/encryption-configuration-encrypted-columns-plaintext-footer.cc +++ /dev/null @@ -1,555 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include - -#include - -#include - -#include "parquet/column_reader.h" -#include "parquet/column_writer.h" -#include "parquet/file_reader.h" -#include "parquet/file_writer.h" -#include "parquet/platform.h" -#include "parquet/test-util.h" - -/* - * This file contains unit-test for writing and reading encrypted Parquet file with - * different encryption and decryption configuration. - * - * A detailed description of the Parquet Modular Encryption specification can be found - * here: - * https://github.com/apache/parquet-format/blob/encryption/Encryption.md - * - * The unit-test creates a single parquet file with eight columns using the - * following encryption configuration: - * - * - Encryption configuration : Encrypt two columns, with different keys. - * Don’t encrypt footer (to enable legacy readers) - * - plaintext footer mode. - * - * The written parquet file produced above is read by each of the following decryption - * configurations: - * - * - Decryption configuration : Decrypt using key retriever that holds the keys of - * two encrypted columns and the footer key. 
- */ - -namespace parquet { -namespace test { -std::string data_file(const char* file) { - std::string dir_string(test::get_data_dir()); - std::stringstream ss; - ss << dir_string << "/" << file; - return ss.str(); -} - -using FileClass = ::arrow::io::FileOutputStream; - -using parquet::ConvertedType; -using parquet::Repetition; -using parquet::Type; -using schema::GroupNode; -using schema::NodePtr; -using schema::PrimitiveNode; - -constexpr int kFixedLength = 10; - -const char kFooterEncryptionKey[] = "0123456789012345"; // 128bit/16 -const char kColumnEncryptionKey1[] = "1234567890123450"; -const char kColumnEncryptionKey2[] = "1234567890123451"; -const char kFileName[] = "tester"; - -class TestEncryptionConfiguration : public ::testing::Test { - public: - void SetUp() { - createDecryptionConfigurations(); - // Setup the parquet schema - schema_ = SetupEncryptionSchema(); - std::string res = "test.parquet.encrypted"; - file_name_ = data_file(res.c_str()); - } - - void TearDown() { - // delete test file. - ASSERT_EQ(std::remove(file_name_.c_str()), 0); - } - - protected: - std::shared_ptr path_to_double_field_ = - parquet::schema::ColumnPath::FromDotString("double_field"); - std::shared_ptr path_to_float_field_ = - parquet::schema::ColumnPath::FromDotString("float_field"); - std::string file_name_; - int rows_per_rowgroup_ = 50; - std::shared_ptr schema_; - // This vector will hold various decryption configurations. 
- std::vector> - vector_of_decryption_configurations_; - std::string kFooterEncryptionKey_ = std::string(kFooterEncryptionKey); - std::string kColumnEncryptionKey1_ = std::string(kColumnEncryptionKey1); - std::string kColumnEncryptionKey2_ = std::string(kColumnEncryptionKey2); - std::string kFileName_ = std::string(kFileName); - - void createDecryptionConfigurations() { - /********************************************************************************** - Creating Decryption configuration - **********************************************************************************/ - - // Decryption configuration: Decrypt using key retriever callback that holds the - // keys of two encrypted columns and the footer key. - std::shared_ptr string_kr1 = - std::make_shared(); - string_kr1->PutKey("kf", kFooterEncryptionKey_); - string_kr1->PutKey("kc1", kColumnEncryptionKey1_); - string_kr1->PutKey("kc2", kColumnEncryptionKey2_); - std::shared_ptr kr1 = - std::static_pointer_cast(string_kr1); - - parquet::FileDecryptionProperties::Builder file_decryption_builder_1; - vector_of_decryption_configurations_.push_back( - file_decryption_builder_1.key_retriever(kr1)->build()); - - } - - void EncryptFile( - std::shared_ptr encryption_configurations, - std::string file) { - std::shared_ptr out_file; - - WriterProperties::Builder prop_builder; - prop_builder.compression(parquet::Compression::SNAPPY); - prop_builder.encryption(encryption_configurations); - std::shared_ptr writer_properties = prop_builder.build(); - - PARQUET_THROW_NOT_OK(FileClass::Open(file, &out_file)); - // Create a ParquetFileWriter instance - std::shared_ptr file_writer = - parquet::ParquetFileWriter::Open(out_file, schema_, writer_properties); - - RowGroupWriter* row_group_writer; - row_group_writer = file_writer->AppendRowGroup(); - - // Write the Bool column - parquet::BoolWriter* bool_writer = - static_cast(row_group_writer->NextColumn()); - for (int i = 0; i < rows_per_rowgroup_; i++) { - bool value = ((i % 2) 
== 0) ? true : false; - bool_writer->WriteBatch(1, nullptr, nullptr, &value); - } - - // Write the Int32 column - parquet::Int32Writer* int32_writer = - static_cast(row_group_writer->NextColumn()); - for (int i = 0; i < rows_per_rowgroup_; i++) { - int32_t value = i; - int32_writer->WriteBatch(1, nullptr, nullptr, &value); - } - - // Write the Int64 column. Each row has repeats twice. - parquet::Int64Writer* int64_writer = - static_cast(row_group_writer->NextColumn()); - for (int i = 0; i < 2 * rows_per_rowgroup_; i++) { - int64_t value = i * 1000 * 1000; - value *= 1000 * 1000; - int16_t definition_level = 1; - int16_t repetition_level = 0; - if ((i % 2) == 0) { - repetition_level = 1; // start of a new record - } - int64_writer->WriteBatch(1, &definition_level, &repetition_level, &value); - } - - // Write the INT96 column. - parquet::Int96Writer* int96_writer = - static_cast(row_group_writer->NextColumn()); - for (int i = 0; i < rows_per_rowgroup_; i++) { - parquet::Int96 value; - value.value[0] = i; - value.value[1] = i + 1; - value.value[2] = i + 2; - int96_writer->WriteBatch(1, nullptr, nullptr, &value); - } - - // Write the Float column - parquet::FloatWriter* float_writer = - static_cast(row_group_writer->NextColumn()); - for (int i = 0; i < rows_per_rowgroup_; i++) { - float value = static_cast(i) * 1.1f; - float_writer->WriteBatch(1, nullptr, nullptr, &value); - } - - // Write the Double column - parquet::DoubleWriter* double_writer = - static_cast(row_group_writer->NextColumn()); - for (int i = 0; i < rows_per_rowgroup_; i++) { - double value = i * 1.1111111; - double_writer->WriteBatch(1, nullptr, nullptr, &value); - } - - // Write the ByteArray column. 
Make every alternate values NULL - parquet::ByteArrayWriter* ba_writer = - static_cast(row_group_writer->NextColumn()); - for (int i = 0; i < rows_per_rowgroup_; i++) { - parquet::ByteArray value; - char hello[kFixedLength] = "parquet"; - hello[7] = static_cast(static_cast('0') + i / 100); - hello[8] = static_cast(static_cast('0') + (i / 10) % 10); - hello[9] = static_cast(static_cast('0') + i % 10); - if (i % 2 == 0) { - int16_t definition_level = 1; - value.ptr = reinterpret_cast(&hello[0]); - value.len = kFixedLength; - ba_writer->WriteBatch(1, &definition_level, nullptr, &value); - } else { - int16_t definition_level = 0; - ba_writer->WriteBatch(1, &definition_level, nullptr, nullptr); - } - } - // Write the FixedLengthByteArray column - parquet::FixedLenByteArrayWriter* flba_writer = - static_cast(row_group_writer->NextColumn()); - for (int i = 0; i < rows_per_rowgroup_; i++) { - parquet::FixedLenByteArray value; - char v = static_cast(i); - char flba[kFixedLength] = {v, v, v, v, v, v, v, v, v, v}; - value.ptr = reinterpret_cast(&flba[0]); - - flba_writer->WriteBatch(1, nullptr, nullptr, &value); - } - // Close the ParquetFileWriter - file_writer->Close(); - - return; - } - - void DecryptFile(std::string file, int example_id, int encryption_configuration) { - std::string exception_msg; - try { - parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); - reader_properties.file_decryption_properties( - vector_of_decryption_configurations_[example_id]->DeepClone()); - - auto file_reader = - parquet::ParquetFileReader::OpenFile(file, false, reader_properties); - - // Get the File MetaData - std::shared_ptr file_metadata = file_reader->metadata(); - - // Get the number of RowGroups - int num_row_groups = file_metadata->num_row_groups(); - ASSERT_EQ(num_row_groups, 1); - - // Get the number of Columns - int num_columns = file_metadata->num_columns(); - ASSERT_EQ(num_columns, 8); - - // Iterate over all the RowGroups in the file - for (int 
r = 0; r < num_row_groups; ++r) { - // Get the RowGroup Reader - std::shared_ptr row_group_reader = - file_reader->RowGroup(r); - - int64_t values_read = 0; - int64_t rows_read = 0; - int16_t definition_level; - int16_t repetition_level; - int i; - std::shared_ptr column_reader; - - // Get the Column Reader for the boolean column - column_reader = row_group_reader->Column(0); - parquet::BoolReader* bool_reader = - static_cast(column_reader.get()); - - // Read all the rows in the column - i = 0; - while (bool_reader->HasNext()) { - bool value; - // Read one value at a time. The number of rows read is returned. values_read - // contains the number of non-null rows - rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - ASSERT_EQ(rows_read, 1); - // There are no NULL values in the rows written - ASSERT_EQ(values_read, 1); - // Verify the value written - bool expected_value = ((i % 2) == 0) ? true : false; - ASSERT_EQ(value, expected_value); - i++; - } - // Get the Column Reader for the Int32 column - column_reader = row_group_reader->Column(1); - parquet::Int32Reader* int32_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (int32_reader->HasNext()) { - int32_t value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - ASSERT_EQ(rows_read, 1); - // There are no NULL values in the rows written - ASSERT_EQ(values_read, 1); - // Verify the value written - ASSERT_EQ(value, i); - i++; - } - // Get the Column Reader for the Int64 column - column_reader = row_group_reader->Column(2); - parquet::Int64Reader* int64_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (int64_reader->HasNext()) { - int64_t value; - // Read one value at a time. The number of rows read is returned. values_read - // contains the number of non-null rows - rows_read = int64_reader->ReadBatch(1, &definition_level, &repetition_level, - &value, &values_read); - // Ensure only one value is read - ASSERT_EQ(rows_read, 1); - // There are no NULL values in the rows written - ASSERT_EQ(values_read, 1); - // Verify the value written - int64_t expected_value = i * 1000 * 1000; - expected_value *= 1000 * 1000; - ASSERT_EQ(value, expected_value); - if ((i % 2) == 0) { - ASSERT_EQ(repetition_level, 1); - } else { - ASSERT_EQ(repetition_level, 0); - } - i++; - } - - // Get the Column Reader for the Int96 column - column_reader = row_group_reader->Column(3); - parquet::Int96Reader* int96_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (int96_reader->HasNext()) { - parquet::Int96 value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = int96_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - ASSERT_EQ(rows_read, 1); - // There are no NULL values in the rows written - ASSERT_EQ(values_read, 1); - // Verify the value written - parquet::Int96 expected_value; - expected_value.value[0] = i; - expected_value.value[1] = i + 1; - expected_value.value[2] = i + 2; - for (int j = 0; j < 3; j++) { - ASSERT_EQ(value.value[j], expected_value.value[j]); - } - i++; - } - - // Get the Column Reader for the Float column - column_reader = row_group_reader->Column(4); - parquet::FloatReader* float_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (float_reader->HasNext()) { - float value; - // Read one value at a time. The number of rows read is returned. values_read - // contains the number of non-null rows - rows_read = float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - ASSERT_EQ(rows_read, 1); - // There are no NULL values in the rows written - ASSERT_EQ(values_read, 1); - // Verify the value written - float expected_value = static_cast(i) * 1.1f; - ASSERT_EQ(value, expected_value); - i++; - } - // Get the Column Reader for the Double column - column_reader = row_group_reader->Column(5); - parquet::DoubleReader* double_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (double_reader->HasNext()) { - double value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - ASSERT_EQ(rows_read, 1); - // There are no NULL values in the rows written - ASSERT_EQ(values_read, 1); - // Verify the value written - double expected_value = i * 1.1111111; - ASSERT_EQ(value, expected_value); - i++; - } - // Get the Column Reader for the ByteArray column - column_reader = row_group_reader->Column(6); - parquet::ByteArrayReader* ba_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (ba_reader->HasNext()) { - parquet::ByteArray value; - // Read one value at a time. The number of rows read is returned. values_read - // contains the number of non-null rows - rows_read = - ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); - // Ensure only one value is read - ASSERT_EQ(rows_read, 1); - // Verify the value written - char expected_value[kFixedLength] = "parquet"; - expected_value[7] = static_cast('0' + i / 100); - expected_value[8] = static_cast('0' + (i / 10) % 10); - expected_value[9] = static_cast('0' + i % 10); - if (i % 2 == 0) { // only alternate values exist - // There are no NULL values in the rows written - ASSERT_EQ(values_read, 1); - ASSERT_EQ(value.len, kFixedLength); - ASSERT_EQ(memcmp(value.ptr, &expected_value[0], kFixedLength), 0); - ASSERT_EQ(definition_level, 1); - } else { - // There are NULL values in the rows written - ASSERT_EQ(values_read, 0); - ASSERT_EQ(definition_level, 0); - } - i++; - } - // Get the Column Reader for the FixedLengthByteArray column - column_reader = row_group_reader->Column(7); - parquet::FixedLenByteArrayReader* flba_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (flba_reader->HasNext()) { - parquet::FixedLenByteArray value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = flba_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - ASSERT_EQ(rows_read, 1); - // There are no NULL values in the rows written - ASSERT_EQ(values_read, 1); - // Verify the value written - char v = static_cast(i); - char expected_value[kFixedLength] = {v, v, v, v, v, v, v, v, v, v}; - ASSERT_EQ(memcmp(value.ptr, &expected_value[0], kFixedLength), 0); - i++; - } - file_reader->Close(); - } - } catch (const std::exception& e) { - exception_msg = e.what(); - } - CheckResult(encryption_configuration, example_id, exception_msg); - } - - // Check that the decryption result is as expected. - void CheckResult(int encryption_configuration_number, int example_id, - std::string exception_msg) { - if (!exception_msg.empty()) { - ASSERT_EQ(1, 0); - } - } - - std::shared_ptr SetupEncryptionSchema() { - parquet::schema::NodeVector fields; - // Create a primitive node named 'boolean_field' with type:BOOLEAN, - // repetition:REQUIRED - fields.push_back(PrimitiveNode::Make("boolean_field", Repetition::REQUIRED, - Type::BOOLEAN, ConvertedType::NONE)); - - // Create a primitive node named 'int32_field' with type:INT32, repetition:REQUIRED, - // logical type:TIME_MILLIS - fields.push_back(PrimitiveNode::Make("int32_field", Repetition::REQUIRED, Type::INT32, - ConvertedType::TIME_MILLIS)); - - // Create a primitive node named 'int64_field' with type:INT64, repetition:REPEATED - fields.push_back(PrimitiveNode::Make("int64_field", Repetition::REPEATED, Type::INT64, - ConvertedType::NONE)); - - fields.push_back(PrimitiveNode::Make("int96_field", Repetition::REQUIRED, Type::INT96, - ConvertedType::NONE)); - - fields.push_back(PrimitiveNode::Make("float_field", Repetition::REQUIRED, Type::FLOAT, - ConvertedType::NONE)); - - fields.push_back(PrimitiveNode::Make("double_field", Repetition::REQUIRED, - Type::DOUBLE, ConvertedType::NONE)); - - // Create a primitive node named 
'ba_field' with type:BYTE_ARRAY, repetition:OPTIONAL - fields.push_back(PrimitiveNode::Make("ba_field", Repetition::OPTIONAL, - Type::BYTE_ARRAY, ConvertedType::NONE)); - - // Create a primitive node named 'flba_field' with type:FIXED_LEN_BYTE_ARRAY, - // repetition:REQUIRED, field_length = kFixedLength - fields.push_back(PrimitiveNode::Make("flba_field", Repetition::REQUIRED, - Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, - kFixedLength)); - - // Create a GroupNode named 'schema' using the primitive nodes defined above - // This GroupNode is the root node of the schema tree - return std::static_pointer_cast( - GroupNode::Make("schema", Repetition::REQUIRED, fields)); - } -}; - -// Encryption configuration: Encrypt two columns, with different keys. -// Don’t encrypt footer. -// (plaintext footer mode, readable by legacy readers) -TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsWithPlaintextFooter) { - std::map, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath> - encryption_cols3; - parquet::ColumnEncryptionProperties::Builder encryption_col_builder_30( - path_to_double_field_); - parquet::ColumnEncryptionProperties::Builder encryption_col_builder_31( - path_to_float_field_); - encryption_col_builder_30.key(kColumnEncryptionKey1_)->key_id("kc1"); - encryption_col_builder_31.key(kColumnEncryptionKey2_)->key_id("kc2"); - - encryption_cols3[path_to_double_field_] = encryption_col_builder_30.build(); - encryption_cols3[path_to_float_field_] = encryption_col_builder_31.build(); - parquet::FileEncryptionProperties::Builder file_encryption_builder_3( - kFooterEncryptionKey_); - - this->EncryptFile(file_encryption_builder_3.footer_key_metadata("kf") - ->column_properties(encryption_cols3) - ->set_plaintext_footer() - ->build(), - file_name_); - - // Iterate over the decryption configurations and use each one to read the encrypted - // parqeut file. 
- for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); - ++example_id) { - DecryptFile(file_name_, example_id, 3 /* encryption_configuration_number */); - } -} - -} // namespace test -} // namespace parquet From b6f4d228cc6bcd67eed72ce5ad056328c3a19234 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Wed, 10 Jul 2019 11:29:40 +0300 Subject: [PATCH 126/201] Remove FooterSigningEncryptor class --- cpp/src/parquet/CMakeLists.txt | 2 +- ...r.cc => encryption-configurations-test.cc} | 273 ++++++++++++++++-- cpp/src/parquet/file_reader.cc | 3 +- cpp/src/parquet/internal_file_decryptor.cc | 34 +-- cpp/src/parquet/internal_file_decryptor.h | 21 +- cpp/src/parquet/metadata.cc | 23 +- cpp/src/parquet/metadata.h | 3 +- 7 files changed, 275 insertions(+), 84 deletions(-) rename cpp/src/parquet/{encryption-configuration-encrypted-columns-and-footer.cc => encryption-configurations-test.cc} (63%) diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 146e82c57627..fdf40a790fcb 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -330,7 +330,7 @@ add_parquet_test(arrow-test if(ARROW_USE_OPENSSL) add_parquet_test(encryption-test SOURCES - encryption-configuration-encrypted-columns-and-footer.cc + encryption-configurations-test.cc encryption-properties-test.cc test-util.cc) endif() diff --git a/cpp/src/parquet/encryption-configuration-encrypted-columns-and-footer.cc b/cpp/src/parquet/encryption-configurations-test.cc similarity index 63% rename from cpp/src/parquet/encryption-configuration-encrypted-columns-and-footer.cc rename to cpp/src/parquet/encryption-configurations-test.cc index fd0e98ad82c9..63e7cccff57f 100644 --- a/cpp/src/parquet/encryption-configuration-encrypted-columns-and-footer.cc +++ b/cpp/src/parquet/encryption-configurations-test.cc @@ -29,28 +29,57 @@ #include "parquet/test-util.h" /* - * This file contains unit-test for writing and reading encrypted Parquet file with 
- * different encryption and decryption configuration. + * This file contains unit-tests for writing and reading encrypted Parquet files with + * different encryption and decryption configurations. + * + * Each unit-test produces a single parquet file, encrypted with one of the encryption + * configurations described below, and is read multiple times using a set of decryption + * configurations, also described below. * * A detailed description of the Parquet Modular Encryption specification can be found * here: * https://github.com/apache/parquet-format/blob/encryption/Encryption.md * - * The unit-test creates a single parquet file with eight columns using the - * following encryption configuration: + * Each unit-test creates a single parquet file with eight columns using one of the + * following encryption configurations: * - * - Encryption configuration : Encrypt two columns and the footer, with different - * keys. + * - Encryption configuration 1: Encrypt all columns and the footer with the same key. + * (uniform encryption) + * - Encryption configuration 2: Encrypt two columns and the footer, with different + * keys. + * - Encryption configuration 3: Encrypt two columns, with different keys. + * Don’t encrypt footer (to enable legacy readers) + * - plaintext footer mode. + * - Encryption configuration 4: Encrypt two columns and the footer, with different + * keys. Supply aad_prefix for file identity + * verification. + * - Encryption configuration 5: Encrypt two columns and the footer, with different + * keys. Supply aad_prefix, and call + * disable_aad_prefix_storage to prevent file + * identity storage in file metadata. + * - Encryption configuration 6: Encrypt two columns and the footer, with different + * keys. Use the alternative (AES_GCM_CTR_V1) algorithm. 
* - * The written parquet file produced above is read by the following decryption + * The written parquet file produced above is read by each of the following decryption * configurations: * - * - Decryption configuration : Decrypt using key retriever that holds the keys of + * - Decryption configuration 1: Decrypt using key retriever that holds the keys of * two encrypted columns and the footer key. + * - Decryption configuration 2: Decrypt using key retriever that holds the keys of + * two encrypted columns and the footer key. Supplies + * aad_prefix to verify file identity. + * - Decryption configuration 3: Decrypt using explicit column and footer keys + * (instead of key retrieval callback). */ namespace parquet { namespace test { +std::string data_file(const char* file) { + std::string dir_string(test::get_data_dir()); + std::stringstream ss; + ss << dir_string << "/" << file; + return ss.str(); +} using FileClass = ::arrow::io::FileOutputStream; @@ -99,19 +128,12 @@ class TestEncryptionConfiguration : public ::testing::Test { std::string kColumnEncryptionKey2_ = std::string(kColumnEncryptionKey2); std::string kFileName_ = std::string(kFileName); - std::string data_file(const char* file) { - std::string dir_string(test::get_data_dir()); - std::stringstream ss; - ss << dir_string << "/" << file; - return ss.str(); - } - void createDecryptionConfigurations() { /********************************************************************************** - Creating Decryption configuration + Creating a number of Decryption configurations **********************************************************************************/ - // Decryption configuration : Decrypt using key retriever callback that holds the + // Decryption configuration 1: Decrypt using key retriever callback that holds the // keys of two encrypted columns and the footer key. 
std::shared_ptr string_kr1 = std::make_shared(); @@ -124,6 +146,45 @@ class TestEncryptionConfiguration : public ::testing::Test { parquet::FileDecryptionProperties::Builder file_decryption_builder_1; vector_of_decryption_configurations_.push_back( file_decryption_builder_1.key_retriever(kr1)->build()); + + // Decryption configuration 2: Decrypt using key retriever callback that holds the + // keys of two encrypted columns and the footer key. Supply aad_prefix. + std::shared_ptr string_kr2 = + std::make_shared(); + string_kr2->PutKey("kf", kFooterEncryptionKey_); + string_kr2->PutKey("kc1", kColumnEncryptionKey1_); + string_kr2->PutKey("kc2", kColumnEncryptionKey2_); + std::shared_ptr kr2 = + std::static_pointer_cast(string_kr2); + + parquet::FileDecryptionProperties::Builder file_decryption_builder_2; + vector_of_decryption_configurations_.push_back( + file_decryption_builder_2.key_retriever(kr2)->aad_prefix(kFileName_)->build()); + + // Decryption configuration 3: Decrypt using explicit column and footer keys. Supply + // aad_prefix. 
+ std::shared_ptr path_float_ptr = + parquet::schema::ColumnPath::FromDotString("float_field"); + std::shared_ptr path_double_ptr = + parquet::schema::ColumnPath::FromDotString("double_field"); + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + decryption_cols; + parquet::ColumnDecryptionProperties::Builder decryption_col_builder31( + path_double_ptr); + parquet::ColumnDecryptionProperties::Builder decryption_col_builder32(path_float_ptr); + + decryption_cols[path_double_ptr] = + decryption_col_builder31.key(kColumnEncryptionKey1_)->build(); + decryption_cols[path_float_ptr] = + decryption_col_builder32.key(kColumnEncryptionKey2_)->build(); + + parquet::FileDecryptionProperties::Builder file_decryption_builder_3; + vector_of_decryption_configurations_.push_back( + file_decryption_builder_3.footer_key(kFooterEncryptionKey_) + ->column_properties(decryption_cols) + ->build()); } void EncryptFile( @@ -239,13 +300,13 @@ class TestEncryptionConfiguration : public ::testing::Test { void DecryptFile(std::string file, int example_id, int encryption_configuration) { std::string exception_msg; + std::unique_ptr file_reader; try { parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); reader_properties.file_decryption_properties( vector_of_decryption_configurations_[example_id]->DeepClone()); - auto file_reader = - parquet::ParquetFileReader::OpenFile(file, false, reader_properties); + file_reader = parquet::ParquetFileReader::OpenFile(file, false, reader_properties); // Get the File MetaData std::shared_ptr file_metadata = file_reader->metadata(); @@ -458,8 +519,8 @@ class TestEncryptionConfiguration : public ::testing::Test { ASSERT_EQ(memcmp(value.ptr, &expected_value[0], kFixedLength), 0); i++; } - file_reader->Close(); } + file_reader->Close(); } catch (const std::exception& e) { exception_msg = e.what(); } @@ -469,6 +530,26 @@ class TestEncryptionConfiguration : public ::testing::Test { // Check that the 
decryption result is as expected. void CheckResult(int encryption_configuration_number, int example_id, std::string exception_msg) { + int decryption_configuration_number = example_id + 1; + // Encryption_configuration number five contains aad_prefix and + // disable_aad_prefix_storage. + // An exception is expected to be thrown if the file is not decrypted with aad_prefix. + if (encryption_configuration_number == 5) { + if (decryption_configuration_number == 1 || decryption_configuration_number == 3) { + std::size_t found = exception_msg.find("AAD"); + ASSERT_FALSE(found == std::string::npos); + return; + } + } + // Decryption configuration number two contains aad_prefix. An exception is expected + // to be thrown if the file was not encrypted with the same aad_prefix. + if (decryption_configuration_number == 2) { + if (encryption_configuration_number != 5 && encryption_configuration_number != 4) { + std::size_t found = exception_msg.find("AAD"); + ASSERT_FALSE(found == std::string::npos); + return; + } + } if (!exception_msg.empty()) { ASSERT_EQ(1, 0); } @@ -516,7 +597,24 @@ class TestEncryptionConfiguration : public ::testing::Test { } }; -// Encryption configuration : Encrypt two columns and the footer, with different keys. +// Encryption configuration 1: Encrypt all columns and the footer with the same key. +// (uniform encryption) +TEST_F(TestEncryptionConfiguration, UniformEncryption) { + parquet::FileEncryptionProperties::Builder file_encryption_builder_1( + kFooterEncryptionKey_); + + this->EncryptFile(file_encryption_builder_1.footer_key_metadata("kf")->build(), + file_name_); + + // Iterate over the decryption configurations and use each one to read the encrypted + // parqeut file. 
+ for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); + ++example_id) { + DecryptFile(file_name_, example_id, 1 /* encryption_configuration_number */); + } +} + +// Encryption configuration 2: Encrypt two columns and the footer, with different keys. TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndTheFooter) { std::map, std::shared_ptr, @@ -552,5 +650,138 @@ TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndTheFooter) { } } +// Encryption configuration 3: Encrypt two columns, with different keys. +// Don’t encrypt footer. +// (plaintext footer mode, readable by legacy readers) +TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsWithPlaintextFooter) { + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols3; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_30( + path_to_double_field_); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_31( + path_to_float_field_); + encryption_col_builder_30.key(kColumnEncryptionKey1_)->key_id("kc1"); + encryption_col_builder_31.key(kColumnEncryptionKey2_)->key_id("kc2"); + + encryption_cols3[path_to_double_field_] = encryption_col_builder_30.build(); + encryption_cols3[path_to_float_field_] = encryption_col_builder_31.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_3( + kFooterEncryptionKey_); + + this->EncryptFile(file_encryption_builder_3.footer_key_metadata("kf") + ->column_properties(encryption_cols3) + ->set_plaintext_footer() + ->build(), + file_name_); + + // Iterate over the decryption configurations and use each one to read the encrypted + // parqeut file. + for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); + ++example_id) { + DecryptFile(file_name_, example_id, 3 /* encryption_configuration_number */); + } +} + +// Encryption configuration 4: Encrypt two columns and the footer, with different keys. +// Use aad_prefix. 
+TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndFooterWithAadPrefix) { + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols4; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_40( + path_to_double_field_); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_41( + path_to_float_field_); + encryption_col_builder_40.key(kColumnEncryptionKey1_)->key_id("kc1"); + encryption_col_builder_41.key(kColumnEncryptionKey2_)->key_id("kc2"); + + encryption_cols4[path_to_double_field_] = encryption_col_builder_40.build(); + encryption_cols4[path_to_float_field_] = encryption_col_builder_41.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_4( + kFooterEncryptionKey_); + + this->EncryptFile(file_encryption_builder_4.footer_key_metadata("kf") + ->column_properties(encryption_cols4) + ->aad_prefix(kFileName_) + ->build(), + file_name_); + // Iterate over the decryption configurations and use each one to read the encrypted + // parqeut file. + for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); + ++example_id) { + DecryptFile(file_name_, example_id, 4 /* encryption_configuration_number */); + } +} + +// Encryption configuration 5: Encrypt two columns and the footer, with different keys. +// Use aad_prefix and disable_aad_prefix_storage. 
+TEST_F(TestEncryptionConfiguration, + EncryptTwoColumnsAndFooterWithAadPrefixDisable_aad_prefix_storage) { + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols5; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_50( + path_to_double_field_); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_51( + path_to_float_field_); + encryption_col_builder_50.key(kColumnEncryptionKey1_)->key_id("kc1"); + encryption_col_builder_51.key(kColumnEncryptionKey2_)->key_id("kc2"); + + encryption_cols5[path_to_double_field_] = encryption_col_builder_50.build(); + encryption_cols5[path_to_float_field_] = encryption_col_builder_51.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_5( + kFooterEncryptionKey_); + + this->EncryptFile(file_encryption_builder_5.column_properties(encryption_cols5) + ->footer_key_metadata("kf") + ->aad_prefix(kFileName_) + ->disable_store_aad_prefix_storage() + ->build(), + file_name_); + // Iterate over the decryption configurations and use each one to read the encrypted + // parqeut file. + for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); + ++example_id) { + DecryptFile(file_name_, example_id, 5 /* encryption_configuration_number */); + } +} + +// Encryption configuration 6: Encrypt two columns and the footer, with different keys. +// Use AES_GCM_CTR_V1 algorithm. 
+TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndFooterUseAES_GCM_CTR) { + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols6; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_60( + path_to_double_field_); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_61( + path_to_float_field_); + encryption_col_builder_60.key(kColumnEncryptionKey1_)->key_id("kc1"); + encryption_col_builder_61.key(kColumnEncryptionKey2_)->key_id("kc2"); + + encryption_cols6[path_to_double_field_] = encryption_col_builder_60.build(); + encryption_cols6[path_to_float_field_] = encryption_col_builder_61.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_6( + kFooterEncryptionKey_); + + this->EncryptFile(file_encryption_builder_6.footer_key_metadata("kf") + ->column_properties(encryption_cols6) + ->algorithm(parquet::ParquetCipher::AES_GCM_CTR_V1) + ->build(), + file_name_); + + // Iterate over the decryption configurations and use each one to read the encrypted + // parqeut file. + for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); + ++example_id) { + DecryptFile(file_name_, example_id, 6 /* encryption_configuration_number */); + } +} + } // namespace test } // namespace parquet diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 02c80386429b..ffde8823c723 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -433,8 +433,7 @@ void SerializedFile::ParseMetaDataOfEncryptedFileWithPlaintextFooter( "Invalid parquet file. Cannot verify plaintext mode footer."); } - auto encryptor = file_decryptor_->GetFooterSigningEncryptor(); - if (!file_metadata_->VerifySignature(encryptor, + if (!file_metadata_->VerifySignature(file_decryptor_.get(), metadata_buffer->data() + read_metadata_len)) { throw ParquetException( "Invalid parquet file. 
Could not verify plaintext " diff --git a/cpp/src/parquet/internal_file_decryptor.cc b/cpp/src/parquet/internal_file_decryptor.cc index 9af59ae01e31..669c6b0ab6ff 100644 --- a/cpp/src/parquet/internal_file_decryptor.cc +++ b/cpp/src/parquet/internal_file_decryptor.cc @@ -21,29 +21,6 @@ namespace parquet { -// FooterSigningEncryptor - -FooterSigningEncryptor::FooterSigningEncryptor(ParquetCipher::type algorithm, - const std::string& key, - const std::string& file_aad, - const std::string& aad) - : key_(key), file_aad_(file_aad), aad_(aad) { - aes_encryptor_.reset(encryption::AesEncryptor::Make( - algorithm, static_cast(key_.size()), true, NULLPTR)); -} - -int FooterSigningEncryptor::CiphertextSizeDelta() { - return aes_encryptor_->CiphertextSizeDelta(); -} - -int FooterSigningEncryptor::SignedFooterEncrypt(const uint8_t* footer, int footer_len, - uint8_t* nonce, - uint8_t* encrypted_footer) { - return aes_encryptor_->SignedFooterEncrypt( - footer, footer_len, str2bytes(key_), static_cast(key_.size()), str2bytes(aad_), - static_cast(aad_.size()), nonce, encrypted_footer); -} - // Decryptor Decryptor::Decryptor(encryption::AesDecryptor* aes_decryptor, const std::string& key, const std::string& file_aad, const std::string& aad, @@ -88,9 +65,7 @@ void InternalFileDecryptor::WipeOutDecryptionKeys() { } } -std::shared_ptr -InternalFileDecryptor::GetFooterSigningEncryptor() { - if (footer_signing_encryptor_ != NULLPTR) return footer_signing_encryptor_; +std::string InternalFileDecryptor::GetFooterKey() { std::string footer_key = properties_->footer_key(); // ignore footer key metadata if footer key is explicitly set via API if (footer_key.empty()) { @@ -111,12 +86,7 @@ InternalFileDecryptor::GetFooterSigningEncryptor() { "Footer key unavailable. 
Could not verify " "plaintext footer metadata"); } - - std::string aad = encryption::CreateFooterAad(file_aad_); - - footer_signing_encryptor_ = - std::make_shared(algorithm_, footer_key, file_aad_, aad); - return footer_signing_encryptor_; + return footer_key; } std::shared_ptr InternalFileDecryptor::GetFooterDecryptor() { diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h index 760337003296..62812888f721 100644 --- a/cpp/src/parquet/internal_file_decryptor.h +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -34,22 +34,6 @@ class AesEncryptor; class FileDecryptionProperties; -class FooterSigningEncryptor { - public: - FooterSigningEncryptor(ParquetCipher::type algorithm, const std::string& key, - const std::string& file_aad, const std::string& aad); - int CiphertextSizeDelta(); - int SignedFooterEncrypt(const uint8_t* footer, int footer_len, uint8_t* nonce, - uint8_t* encrypted_footer); - - private: - std::string key_; - std::string file_aad_; - std::string aad_; - - std::shared_ptr aes_encryptor_; -}; - class Decryptor { public: Decryptor(encryption::AesDecryptor* decryptor, const std::string& key, @@ -81,12 +65,12 @@ class InternalFileDecryptor { std::string& file_aad() { return file_aad_; } + std::string GetFooterKey(); + ParquetCipher::type algorithm() { return algorithm_; } std::string& footer_key_metadata() { return footer_key_metadata_; } - std::shared_ptr GetFooterSigningEncryptor(); - FileDecryptionProperties* properties() { return properties_; } void WipeOutDecryptionKeys(); @@ -116,7 +100,6 @@ class InternalFileDecryptor { std::shared_ptr footer_data_decryptor_; ParquetCipher::type algorithm_; std::string footer_key_metadata_; - std::shared_ptr footer_signing_encryptor_; std::vector all_decryptors_; /// Key must be 16, 24 or 32 bytes in length. 
Thus there could be up to three diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index b1c4f0e08657..bdc657c61e65 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -518,8 +518,7 @@ class FileMetaData::FileMetaDataImpl { } #ifdef PARQUET_ENCRYPTION - bool VerifySignature(std::shared_ptr encryptor, - const void* signature) { + bool VerifySignature(InternalFileDecryptor* file_decryptor, const void* signature) { // serialize the footer uint8_t* serialized_data; uint32_t serialized_len = metadata_len_; @@ -531,10 +530,20 @@ class FileMetaData::FileMetaDataImpl { uint8_t* tag = const_cast(reinterpret_cast(signature)) + encryption::kNonceLength; - std::vector encrypted_buffer(encryptor->CiphertextSizeDelta() + + std::string key = file_decryptor->GetFooterKey(); + std::string aad = encryption::CreateFooterAad(file_decryptor->file_aad()); + + auto aes_encryptor = encryption::AesEncryptor::Make( + file_decryptor->algorithm(), static_cast(key.size()), true, NULLPTR); + + std::vector encrypted_buffer(aes_encryptor->CiphertextSizeDelta() + serialized_len); - uint32_t encrypted_len = encryptor->SignedFooterEncrypt( - serialized_data, serialized_len, nonce, encrypted_buffer.data()); + uint32_t encrypted_len = aes_encryptor->SignedFooterEncrypt( + serialized_data, serialized_len, str2bytes(key), static_cast(key.size()), + str2bytes(aad), static_cast(aad.size()), nonce, encrypted_buffer.data()); + // Delete AES encryptor object. It was created only to verify the footer signature. 
+ aes_encryptor->WipeOut(); + delete aes_encryptor; return 0 == memcmp(encrypted_buffer.data() + encrypted_len - encryption::kGcmTagLength, tag, encryption::kGcmTagLength); @@ -705,9 +714,9 @@ std::unique_ptr FileMetaData::RowGroup(int i) const { } #ifdef PARQUET_ENCRYPTION -bool FileMetaData::VerifySignature(std::shared_ptr encryptor, +bool FileMetaData::VerifySignature(InternalFileDecryptor* file_decryptor, const void* signature) { - return impl_->VerifySignature(encryptor, signature); + return impl_->VerifySignature(file_decryptor, signature); } #endif diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 40da66ec8141..c50924f290b5 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -212,8 +212,7 @@ class PARQUET_EXPORT FileMetaData { /// Verify signature of FileMetadata when file is encrypted but footer is not encrypted /// (plaintext footer). /// Signature is 28 bytes (12 byte nonce and 16 byte tags) when encrypting FileMetadata - bool VerifySignature(std::shared_ptr encryptor, - const void* signature); + bool VerifySignature(InternalFileDecryptor* file_decryptor, const void* signature); #endif // file metadata From 0a0e0f816d87962b20e7d48dca06f19a0ba875d7 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Thu, 11 Jul 2019 22:47:30 +0700 Subject: [PATCH 127/201] remove some PARQUET_ENCRYPTION define check --- cpp/src/parquet/column_reader.cc | 13 +++--- cpp/src/parquet/file_reader.cc | 75 +++++++++++++++----------------- cpp/src/parquet/file_writer.cc | 71 ++++++++++++------------------ cpp/src/parquet/metadata.cc | 32 ++------------ cpp/src/parquet/metadata.h | 8 ---- cpp/src/parquet/properties.h | 2 +- cpp/src/parquet/thrift.h | 18 +++----- 7 files changed, 82 insertions(+), 137 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index f2ae8454a5e2..7b029b7e74e6 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -139,13 +139,13 @@ class 
SerializedPageReader : public PageReader { column_has_dictionary_ = ctx->column_has_dictionary; row_group_ordinal_ = ctx->row_group_ordinal; column_ordinal_ = ctx->column_ordinal; -#ifdef PARQUET_ENCRYPTION meta_decryptor_ = ctx->meta_decryptor; data_decryptor_ = ctx->data_decryptor; if (data_decryptor_ != NULLPTR || meta_decryptor_ != NULLPTR) { +#ifdef PARQUET_ENCRYPTION InitDecryption(); - } #endif + } } max_page_header_size_ = kDefaultMaxPageHeaderSize; decompressor_ = GetCodecFromArrow(codec); @@ -255,11 +255,13 @@ std::shared_ptr SerializedPageReader::NextPage() { // Loop here because there may be unhandled page types that we skip until // finding a page that we do know what to do with #ifdef PARQUET_ENCRYPTION + // we must use #ifdef here because current_page_is_dictionary is not used bool current_page_is_dictionary = false; #endif if (column_has_dictionary_) { if (first_page_) { #ifdef PARQUET_ENCRYPTION + // we must use #ifdef here because current_page_is_dictionary is not used current_page_is_dictionary = true; #endif first_page_ = false; @@ -292,12 +294,9 @@ std::shared_ptr SerializedPageReader::NextPage() { UpdateDecryption(meta_decryptor_, current_page_is_dictionary, encryption::kDictionaryPageHeader, data_page_headerAAD_); } +#endif DeserializeThriftMsg(reinterpret_cast(buffer.data()), &header_size, ¤t_page_header_, meta_decryptor_); -#else - DeserializeThriftMsg(reinterpret_cast(buffer.data()), - &header_size, ¤t_page_header_); -#endif // PARQUET_ENCRYPTION break; } catch (std::exception& e) { // Failed to deserialize. 
Double the allowed page header size and try again @@ -331,8 +330,8 @@ std::shared_ptr SerializedPageReader::NextPage() { ParquetException::EofException(ss.str()); } -#ifdef PARQUET_ENCRYPTION // Decrypt it if we need to +#ifdef PARQUET_ENCRYPTION if (data_decryptor_ != nullptr) { PARQUET_THROW_NOT_OK(decryption_buffer_->Resize( compressed_len - data_decryptor_->CiphertextSizeDelta())); diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index ffde8823c723..8e681dc42ad5 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -133,23 +133,26 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::shared_ptr stream = properties_.GetStream(source_, col_start, col_length); -#ifdef PARQUET_ENCRYPTION std::unique_ptr crypto_metadata = col->crypto_metadata(); + PageReaderContext ctx; // Column is encrypted only if crypto_metadata exists. if (!crypto_metadata) { - PageReaderContext ctx = {col->has_dictionary_page(), row_group_ordinal_, - static_cast(i), NULLPTR, NULLPTR}; + ctx = PageReaderContext{col->has_dictionary_page(), row_group_ordinal_, + static_cast(i), NULLPTR, NULLPTR}; return PageReader::Open(stream, col->num_values(), col->compression(), properties_.memory_pool(), &ctx); } +#ifdef PARQUET_ENCRYPTION // The column is encrypted + std::shared_ptr meta_decryptor; + std::shared_ptr data_decryptor; // The column is encrypted with footer key if (crypto_metadata->encrypted_with_footer_key()) { - auto meta_decryptor = file_decryptor_->GetFooterDecryptorForColumnMeta(); - auto data_decryptor = file_decryptor_->GetFooterDecryptorForColumnData(); + meta_decryptor = file_decryptor_->GetFooterDecryptorForColumnMeta(); + data_decryptor = file_decryptor_->GetFooterDecryptorForColumnData(); PageReaderContext ctx = {col->has_dictionary_page(), row_group_ordinal_, static_cast(i), meta_decryptor, data_decryptor}; return PageReader::Open(stream, col->num_values(), col->compression(), @@ -161,19 +164,16 @@ class 
SerializedRowGroup : public RowGroupReader::Contents { std::shared_ptr column_path = std::make_shared(crypto_metadata->path_in_schema()); - auto meta_decryptor = + meta_decryptor = file_decryptor_->GetColumnMetaDecryptor(column_path, column_key_metadata); - auto data_decryptor = + data_decryptor = file_decryptor_->GetColumnDataDecryptor(column_path, column_key_metadata); - PageReaderContext ctx = {col->has_dictionary_page(), row_group_ordinal_, - static_cast(i), meta_decryptor, data_decryptor}; + ctx = PageReaderContext{col->has_dictionary_page(), row_group_ordinal_, + static_cast(i), meta_decryptor, data_decryptor}; +#endif // PARQUET_ENCRYPTION return PageReader::Open(stream, col->num_values(), col->compression(), properties_.memory_pool(), &ctx); -#else - return PageReader::Open(stream, col->num_values(), col->compression(), - properties_.memory_pool()); -#endif } private: @@ -211,13 +211,12 @@ class SerializedFile : public ParquetFileReader::Contents { } std::shared_ptr GetRowGroup(int i) override { + InternalFileDecryptor* file_decryptor = NULLPTR; #ifdef PARQUET_ENCRYPTION - std::unique_ptr contents(new SerializedRowGroup( - source_, file_metadata_.get(), i, properties_, file_decryptor_.get())); -#else - std::unique_ptr contents( - new SerializedRowGroup(source_, file_metadata_.get(), i, properties_)); + file_decryptor = file_decryptor_.get(); #endif + std::unique_ptr contents(new SerializedRowGroup( + source_, file_metadata_.get(), i, properties_, file_decryptor)); return std::make_shared(std::move(contents)); } @@ -266,35 +265,33 @@ class SerializedFile : public ParquetFileReader::Contents { #endif #if PARQUET_ENCRYPTION - // No encryption or encryption with plaintext footer mode. 
- if (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) == 0) { - std::shared_ptr metadata_buffer; - uint32_t metadata_len, read_metadata_len; - ParseUnencryptedFileMetadata(footer_buffer, footer_read_size, file_size, - &metadata_buffer, &metadata_len, &read_metadata_len); - - auto file_decryption_properties = properties_.file_decryption_properties(); - if (!file_metadata_->is_encryption_algorithm_set()) { // Non encrypted file. - if (file_decryption_properties != NULLPTR) { - if (!file_decryption_properties->plaintext_files_allowed()) { - throw ParquetException("Applying decryption properties on plaintext file"); - } - } - } else { - // Encrypted file with plaintext footer mode. - ParseMetaDataOfEncryptedFileWithPlaintextFooter( - file_decryption_properties, metadata_buffer, metadata_len, read_metadata_len); - } - } else { + if (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) == 0) { // Encrypted file with Encrypted footer. ParseMetaDataOfEncryptedFileWithEncryptedFooter(footer_buffer, footer_read_size, file_size); + return; } -#else // not defined PARQUET_ENCRYPTION +#endif + + // No encryption or encryption with plaintext footer mode. std::shared_ptr metadata_buffer; uint32_t metadata_len, read_metadata_len; ParseUnencryptedFileMetadata(footer_buffer, footer_read_size, file_size, &metadata_buffer, &metadata_len, &read_metadata_len); + +#ifdef PARQUET_ENCRYPTION + auto file_decryption_properties = properties_.file_decryption_properties(); + if (!file_metadata_->is_encryption_algorithm_set()) { // Non encrypted file. + if (file_decryption_properties != NULLPTR) { + if (!file_decryption_properties->plaintext_files_allowed()) { + throw ParquetException("Applying decryption properties on plaintext file"); + } + } + } else { + // Encrypted file with plaintext footer mode. 
+ ParseMetaDataOfEncryptedFileWithPlaintextFooter( + file_decryption_properties, metadata_buffer, metadata_len, read_metadata_len); + } #endif } diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index f4e6426a47a0..721c40dcee38 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -134,24 +134,21 @@ class RowGroupSerializer : public RowGroupWriter::Contents { ++next_column_index_; const ColumnDescriptor* column_descr = col_meta->descr(); -#ifdef PARQUET_ENCRYPTION - auto meta_encryptor = - file_encryptor_ ? file_encryptor_->GetColumnMetaEncryptor(column_descr->path()) - : NULLPTR; - auto data_encryptor = - file_encryptor_ ? file_encryptor_->GetColumnDataEncryptor(column_descr->path()) - : NULLPTR; + std::shared_ptr meta_encryptor = NULLPTR; + std::shared_ptr data_encryptor = NULLPTR; +#ifdef PARQUET_ENCRYPTION + meta_encryptor = file_encryptor_ + ? file_encryptor_->GetColumnMetaEncryptor(column_descr->path()) + : NULLPTR; + data_encryptor = file_encryptor_ + ? 
file_encryptor_->GetColumnDataEncryptor(column_descr->path()) + : NULLPTR; +#endif std::unique_ptr pager = PageWriter::Open( sink_, properties_->compression(column_descr->path()), col_meta, row_group_ordinal_, static_cast(current_column_index_ - 1), properties_->memory_pool(), false, meta_encryptor, data_encryptor); -#else - std::unique_ptr pager = PageWriter::Open( - sink_, properties_->compression(column_descr->path()), col_meta, - row_group_ordinal_, static_cast(current_column_index_ - 1), - properties_->memory_pool(), false); -#endif column_writers_[0] = ColumnWriter::Make(col_meta, std::move(pager), properties_); return column_writers_[0].get(); } @@ -248,25 +245,22 @@ class RowGroupSerializer : public RowGroupWriter::Contents { for (int i = 0; i < num_columns(); i++) { auto col_meta = metadata_->NextColumnChunk(); const ColumnDescriptor* column_descr = col_meta->descr(); + + std::shared_ptr meta_encryptor = NULLPTR; + std::shared_ptr data_encryptor = NULLPTR; #ifdef PARQUET_ENCRYPTION - auto meta_encryptor = - file_encryptor_ ? file_encryptor_->GetColumnMetaEncryptor(column_descr->path()) - : NULLPTR; - auto data_encryptor = - file_encryptor_ ? file_encryptor_->GetColumnDataEncryptor(column_descr->path()) - : NULLPTR; + meta_encryptor = file_encryptor_ + ? file_encryptor_->GetColumnMetaEncryptor(column_descr->path()) + : NULLPTR; + data_encryptor = file_encryptor_ + ? 
file_encryptor_->GetColumnDataEncryptor(column_descr->path()) + : NULLPTR; +#endif std::unique_ptr pager = PageWriter::Open( sink_, properties_->compression(column_descr->path()), col_meta, static_cast(row_group_ordinal_), static_cast(current_column_index_), properties_->memory_pool(), buffered_row_group_, meta_encryptor, data_encryptor); -#else - std::unique_ptr pager = - PageWriter::Open(sink_, properties_->compression(column_descr->path()), - col_meta, static_cast(row_group_ordinal_), - static_cast(current_column_index_), - properties_->memory_pool(), buffered_row_group_); -#endif column_writers_.push_back( ColumnWriter::Make(col_meta, std::move(pager), properties_)); } @@ -305,7 +299,6 @@ class FileSerializer : public ParquetFileWriter::Contents { } row_group_writer_.reset(); -#ifdef PARQUET_ENCRYPTION // Write magic bytes and metadata auto file_encryption_properties = properties_->file_encryption_properties(); @@ -313,12 +306,10 @@ class FileSerializer : public ParquetFileWriter::Contents { file_metadata_ = metadata_->Finish(); WriteFileMetaData(*file_metadata_, sink_.get()); } else { // Encrypted file +#ifdef PARQUET_ENCRYPTION CloseEncryptedFile(file_encryption_properties); - } -#else - file_metadata_ = metadata_->Finish(); - WriteFileMetaData(*file_metadata_, sink_.get()); #endif + } } } @@ -338,15 +329,14 @@ class FileSerializer : public ParquetFileWriter::Contents { } num_row_groups_++; auto rg_metadata = metadata_->AppendRowGroup(); + InternalFileEncryptor* file_encryptor = NULLPTR; + #ifdef PARQUET_ENCRYPTION + file_encryptor = file_encryptor_.get(); +#endif std::unique_ptr contents(new RowGroupSerializer( sink_, rg_metadata, static_cast(num_row_groups_ - 1), properties_.get(), - buffered_row_group, file_encryptor_.get())); -#else - std::unique_ptr contents(new RowGroupSerializer( - sink_, rg_metadata, static_cast(num_row_groups_ - 1), properties_.get(), - buffered_row_group)); -#endif + buffered_row_group, file_encryptor)); row_group_writer_.reset(new 
RowGroupWriter(std::move(contents))); return row_group_writer_.get(); } @@ -423,12 +413,12 @@ class FileSerializer : public ParquetFileWriter::Contents { #endif void StartFile() { -#ifdef PARQUET_ENCRYPTION auto file_encryption_properties = properties_->file_encryption_properties(); if (file_encryption_properties == nullptr) { // Unencrypted parquet files always start with PAR1 PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4)); } else { +#ifdef PARQUET_ENCRYPTION file_encryptor_.reset(new InternalFileEncryptor(file_encryption_properties, properties_->memory_pool())); if (file_encryption_properties->encrypted_footer()) { @@ -437,11 +427,8 @@ class FileSerializer : public ParquetFileWriter::Contents { // Encrypted file with plaintext footer mode. PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4)); } - } -#else - // Unencrypted parquet files always start with PAR1 - PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4)); #endif + } } }; diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index bdc657c61e65..298760c80e09 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -47,10 +47,6 @@ using ::std::smatch; #ifdef PARQUET_ENCRYPTION #include "parquet/internal_file_decryptor.h" -#else -namespace parquet { -class Decryptor; -} #endif namespace parquet { @@ -133,7 +129,6 @@ std::shared_ptr MakeColumnStats(const format::ColumnMetaData& meta_d // MetaData Accessor -#ifdef PARQUET_ENCRYPTION // ColumnCryptoMetaData class ColumnCryptoMetaData::ColumnCryptoMetaDataImpl { public: @@ -179,7 +174,6 @@ bool ColumnCryptoMetaData::encrypted_with_footer_key() const { const std::string& ColumnCryptoMetaData::key_metadata() const { return impl_->key_metadata(); } -#endif // PARQUET_ENCRYPTION // ColumnChunk metadata class ColumnChunkMetaData::ColumnChunkMetaDataImpl { @@ -190,8 +184,8 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { const ApplicationVersion* writer_version, InternalFileDecryptor* file_decryptor = NULLPTR) : 
column_(column), descr_(descr), writer_version_(writer_version) { -#ifdef PARQUET_ENCRYPTION if (column->__isset.crypto_metadata) { // column metadata is encrypted +#ifdef PARQUET_ENCRYPTION format::ColumnCryptoMetaData ccmd = column->crypto_metadata; if (ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY) { @@ -216,12 +210,10 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { } else { is_metadata_set_ = true; } +#endif // PARQUET_ENCRYPTION } else { // column metadata is not encrypted is_metadata_set_ = true; } -#else - is_metadata_set_ = true; -#endif // PARQUET_ENCRYPTION if (is_metadata_set_) { const format::ColumnMetaData& meta_data = GetMetadataIfSet(); @@ -303,7 +295,6 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { return GetMetadataIfSet().total_uncompressed_size; } -#ifdef PARQUET_ENCRYPTION inline std::unique_ptr crypto_metadata() const { if (column_->__isset.crypto_metadata) { return ColumnCryptoMetaData::Make( @@ -312,7 +303,6 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { return nullptr; } } -#endif private: mutable std::shared_ptr possible_stats_; @@ -324,7 +314,6 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { bool is_metadata_set_; inline const format::ColumnMetaData& GetMetadataIfSet() const { -#ifdef PARQUET_ENCRYPTION if (column_->__isset.crypto_metadata && column_->crypto_metadata.__isset.ENCRYPTION_WITH_COLUMN_KEY) { if (!is_metadata_set_) { @@ -337,9 +326,6 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { } else { return column_->meta_data; } -#else - return column_->meta_data; -#endif } }; @@ -419,11 +405,9 @@ int64_t ColumnChunkMetaData::total_compressed_size() const { return impl_->total_compressed_size(); } -#ifdef PARQUET_ENCRYPTION std::unique_ptr ColumnChunkMetaData::crypto_metadata() const { return impl_->crypto_metadata(); } -#endif // row-group metadata class RowGroupMetaData::RowGroupMetaDataImpl { @@ -562,7 +546,6 @@ class FileMetaData::FileMetaDataImpl { return static_cast(metadata_->schema.size()); } 
-#ifdef PARQUET_ENCRYPTION inline bool is_encryption_algorithm_set() const { return metadata_->__isset.encryption_algorithm; } @@ -572,17 +555,16 @@ class FileMetaData::FileMetaDataImpl { inline const std::string& footer_signing_key_metadata() { return metadata_->footer_signing_key_metadata; } -#endif const ApplicationVersion& writer_version() const { return writer_version_; } void WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryptor) const { ThriftSerializer serializer; -#ifdef PARQUET_ENCRYPTION // Only in encrypted files with plaintext footers the // encryption_algorithm is set in footer if (is_encryption_algorithm_set()) { +#ifdef PARQUET_ENCRYPTION uint8_t* serialized_data; uint32_t serialized_len; serializer.SerializeToBuffer(metadata_.get(), &serialized_len, &serialized_data); @@ -601,13 +583,11 @@ class FileMetaData::FileMetaDataImpl { PARQUET_THROW_NOT_OK( dst->Write(encrypted_data.data() + encrypted_len - encryption::kGcmTagLength, encryption::kGcmTagLength)); +#endif // PARQUET_ENCRYPTION } else { // either plaintext file (when encryptor is null) // or encrypted file with encrypted footer serializer.Serialize(metadata_.get(), dst, encryptor); } -#else - serializer.Serialize(metadata_.get(), dst); -#endif // PARQUET_ENCRYPTION } std::unique_ptr RowGroup(int i) { @@ -728,7 +708,6 @@ int64_t FileMetaData::num_rows() const { return impl_->num_rows(); } int FileMetaData::num_row_groups() const { return impl_->num_row_groups(); } -#ifdef PARQUET_ENCRYPTION bool FileMetaData::is_encryption_algorithm_set() const { return impl_->is_encryption_algorithm_set(); } @@ -740,7 +719,6 @@ EncryptionAlgorithm FileMetaData::encryption_algorithm() const { const std::string& FileMetaData::footer_signing_key_metadata() const { return impl_->footer_signing_key_metadata(); } -#endif // PARQUET_ENCRYPTION ParquetVersion::type FileMetaData::version() const { switch (impl_->version()) { @@ -780,7 +758,6 @@ void FileMetaData::WriteTo(::arrow::io::OutputStream* 
dst, return impl_->WriteTo(dst, encryptor); } -#ifdef PARQUET_ENCRYPTION class FileCryptoMetaData::FileCryptoMetaDataImpl { public: FileCryptoMetaDataImpl() {} @@ -833,7 +810,6 @@ FileCryptoMetaData::~FileCryptoMetaData() {} void FileCryptoMetaData::WriteTo(::arrow::io::OutputStream* dst) const { impl_->WriteTo(dst); } -#endif // PARQUET_ENCRYPTION ApplicationVersion::ApplicationVersion(const std::string& application, int major, int minor, int patch) diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index c50924f290b5..824f7bb48e88 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -105,7 +105,6 @@ class PARQUET_EXPORT ApplicationVersion { SortOrder::type sort_order = SortOrder::SIGNED) const; }; -#ifdef PARQUET_ENCRYPTION class PARQUET_EXPORT ColumnCryptoMetaData { public: static std::unique_ptr Make(const uint8_t* metadata); @@ -121,7 +120,6 @@ class PARQUET_EXPORT ColumnCryptoMetaData { class ColumnCryptoMetaDataImpl; std::unique_ptr impl_; }; -#endif class PARQUET_EXPORT ColumnChunkMetaData { public: @@ -155,9 +153,7 @@ class PARQUET_EXPORT ColumnChunkMetaData { int64_t index_page_offset() const; int64_t total_compressed_size() const; int64_t total_uncompressed_size() const; -#ifdef PARQUET_ENCRYPTION std::unique_ptr crypto_metadata() const; -#endif private: explicit ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, @@ -229,11 +225,9 @@ class PARQUET_EXPORT FileMetaData { std::unique_ptr RowGroup(int i) const; const ApplicationVersion& writer_version() const; -#ifdef PARQUET_ENCRYPTION bool is_encryption_algorithm_set() const; EncryptionAlgorithm encryption_algorithm() const; const std::string& footer_signing_key_metadata() const; -#endif void WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryptor = NULLPTR) const; @@ -261,7 +255,6 @@ class PARQUET_EXPORT FileMetaData { std::unique_ptr impl_; }; -#ifdef PARQUET_ENCRYPTION class PARQUET_EXPORT FileCryptoMetaData { public: // API 
convenience to get a MetaData accessor @@ -283,7 +276,6 @@ class PARQUET_EXPORT FileCryptoMetaData { class FileCryptoMetaDataImpl; std::unique_ptr impl_; }; -#endif // Builder API class PARQUET_EXPORT ColumnChunkMetaDataBuilder { diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 9a6256d0e372..20c8302217ce 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -429,11 +429,11 @@ class PARQUET_EXPORT WriterProperties { return column_properties(path).max_statistics_size(); } -#ifdef PARQUET_ENCRYPTION inline FileEncryptionProperties* file_encryption_properties() const { return file_encryption_properties_.get(); } +#ifdef PARQUET_ENCRYPTION std::shared_ptr column_encryption_properties( const std::shared_ptr& path) const { if (file_encryption_properties_) { diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 3d498fc0253a..108fc1d1f34b 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -232,11 +232,11 @@ inline void DeserializeThriftUnencryptedMsg(const uint8_t* buf, uint32_t* len, template inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg, const std::shared_ptr& decryptor = NULLPTR) { -#ifdef PARQUET_ENCRYPTION // thrift message is not encrypted if (decryptor == NULLPTR) { DeserializeThriftUnencryptedMsg(buf, len, deserialized_msg); } else { // thrift message is encrypted +#ifdef PARQUET_ENCRYPTION uint32_t clen; clen = *len; // decrypt @@ -253,10 +253,8 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali *len = decrypted_buffer_len + decryptor->CiphertextSizeDelta(); DeserializeThriftMsg(decrypted_buffer->data(), &decrypted_buffer_len, deserialized_msg); - } -#else - DeserializeThriftUnencryptedMsg(buf, len, deserialized_msg); #endif // PARQUET_ENCRYPTION + } } /// Utility class to serialize thrift objects to a binary format. 
This object @@ -293,7 +291,6 @@ class ThriftSerializer { uint32_t out_length; SerializeToBuffer(obj, &out_length, &out_buffer); -#ifdef PARQUET_ENCRYPTION // obj is not encrypted if (encryptor == NULLPTR) { PARQUET_THROW_NOT_OK(out->Write(out_buffer, out_length)); @@ -301,10 +298,6 @@ class ThriftSerializer { } else { // obj is encrypted return SerializeEncryptedObj(out, out_buffer, out_length, encryptor); } -#else - PARQUET_THROW_NOT_OK(out->Write(out_buffer, out_length)); - return static_cast(out_length); -#endif } private: @@ -320,21 +313,22 @@ class ThriftSerializer { } } -#ifdef PARQUET_ENCRYPTION int64_t SerializeEncryptedObj(ArrowOutputStream* out, uint8_t* out_buffer, uint32_t out_length, const std::shared_ptr& encryptor) { + int cipher_buffer_len = 0; +#ifdef PARQUET_ENCRYPTION std::shared_ptr cipher_buffer = std::static_pointer_cast(AllocateBuffer( encryptor->pool(), static_cast(encryptor->CiphertextSizeDelta() + out_length))); - int cipher_buffer_len = + cipher_buffer_len = encryptor->Encrypt(out_buffer, out_length, cipher_buffer->mutable_data()); PARQUET_THROW_NOT_OK(out->Write(cipher_buffer->data(), cipher_buffer_len)); +#endif return static_cast(cipher_buffer_len); } -#endif shared_ptr mem_buffer_; shared_ptr protocol_; From 00d4ad8cb06a0323768e8e6b6d7addfe7283ba0e Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Tue, 16 Jul 2019 22:14:26 +0700 Subject: [PATCH 128/201] remove PARQUET_ENCRYPTION ifdefs and add encryption_internal-nossl.cc for compile issue without openssl --- cpp/src/parquet/CMakeLists.txt | 13 ++- cpp/src/parquet/column_reader.cc | 25 +---- cpp/src/parquet/column_writer.cc | 23 +--- cpp/src/parquet/encryption.cc | 6 +- cpp/src/parquet/encryption_internal-nossl.cc | 108 +++++++++++++++++++ cpp/src/parquet/encryption_internal.cc | 2 + cpp/src/parquet/encryption_internal.h | 2 + cpp/src/parquet/file_reader.cc | 44 +------- cpp/src/parquet/file_writer.cc | 60 +++-------- cpp/src/parquet/file_writer.h | 2 - cpp/src/parquet/metadata.cc | 26 
+---- cpp/src/parquet/metadata.h | 5 +- cpp/src/parquet/properties.h | 15 --- cpp/src/parquet/thrift.h | 19 +--- 14 files changed, 151 insertions(+), 199 deletions(-) create mode 100644 cpp/src/parquet/encryption_internal-nossl.cc diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index fdf40a790fcb..7e540287824d 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -167,8 +167,11 @@ set(PARQUET_SRCS column_writer.cc deprecated_io.cc encoding.cc + encryption.cc file_reader.cc file_writer.cc + internal_file_decryptor.cc + internal_file_encryptor.cc metadata.cc murmur3.cc parquet_constants.cpp @@ -181,13 +184,13 @@ set(PARQUET_SRCS types.cc) if(ARROW_USE_OPENSSL) - add_definitions(-DPARQUET_ENCRYPTION) set(PARQUET_SRCS ${PARQUET_SRCS} - encryption.cc - encryption_internal.cc - internal_file_decryptor.cc - internal_file_encryptor.cc) + encryption_internal.cc) +else() + set(PARQUET_SRCS + ${PARQUET_SRCS} + encryption_internal-nossl.cc) endif() # Ensure that thrift compilation is done before using its generated headers diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 7b029b7e74e6..16351a3a3761 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -40,15 +40,12 @@ #include "parquet/column_page.h" #include "parquet/encoding.h" +#include "parquet/encryption_internal.h" +#include "parquet/internal_file_decryptor.h" #include "parquet/properties.h" #include "parquet/statistics.h" #include "parquet/thrift.h" // IWYU pragma: keep -#ifdef PARQUET_ENCRYPTION -#include "parquet/encryption_internal.h" -#include "parquet/internal_file_decryptor.h" -#endif - using arrow::MemoryPool; using arrow::internal::checked_cast; @@ -142,9 +139,7 @@ class SerializedPageReader : public PageReader { meta_decryptor_ = ctx->meta_decryptor; data_decryptor_ = ctx->data_decryptor; if (data_decryptor_ != NULLPTR || meta_decryptor_ != NULLPTR) { -#ifdef PARQUET_ENCRYPTION 
InitDecryption(); -#endif } } max_page_header_size_ = kDefaultMaxPageHeaderSize; @@ -157,13 +152,11 @@ class SerializedPageReader : public PageReader { void set_max_page_header_size(uint32_t size) override { max_page_header_size_ = size; } private: -#ifdef PARQUET_ENCRYPTION void UpdateDecryption(const std::shared_ptr& decryptor, bool current_page_is_dictionary, int8_t module_type, const std::string& pageAAD); void InitDecryption(); -#endif std::shared_ptr stream_; @@ -217,7 +210,6 @@ class SerializedPageReader : public PageReader { std::shared_ptr data_decryptor_; }; -#ifdef PARQUET_ENCRYPTION void SerializedPageReader::InitDecryption() { // Prepare the AAD for quick update later. if (data_decryptor_ != NULLPTR) { @@ -249,21 +241,14 @@ void SerializedPageReader::UpdateDecryption(const std::shared_ptr& de decryptor->UpdateAad(pageAAD); } } -#endif // PARQUET_ENCRYPTION std::shared_ptr SerializedPageReader::NextPage() { // Loop here because there may be unhandled page types that we skip until // finding a page that we do know what to do with -#ifdef PARQUET_ENCRYPTION - // we must use #ifdef here because current_page_is_dictionary is not used bool current_page_is_dictionary = false; -#endif if (column_has_dictionary_) { if (first_page_) { -#ifdef PARQUET_ENCRYPTION - // we must use #ifdef here because current_page_is_dictionary is not used current_page_is_dictionary = true; -#endif first_page_ = false; } else { page_ordinal_++; @@ -289,12 +274,10 @@ std::shared_ptr SerializedPageReader::NextPage() { // This gets used, then set by DeserializeThriftMsg header_size = static_cast(buffer.size()); try { -#ifdef PARQUET_ENCRYPTION if (meta_decryptor_ != NULLPTR) { UpdateDecryption(meta_decryptor_, current_page_is_dictionary, encryption::kDictionaryPageHeader, data_page_headerAAD_); } -#endif DeserializeThriftMsg(reinterpret_cast(buffer.data()), &header_size, ¤t_page_header_, meta_decryptor_); break; @@ -314,12 +297,10 @@ std::shared_ptr SerializedPageReader::NextPage() { 
int compressed_len = current_page_header_.compressed_page_size; int uncompressed_len = current_page_header_.uncompressed_page_size; -#ifdef PARQUET_ENCRYPTION if (data_decryptor_ != NULLPTR) { UpdateDecryption(data_decryptor_, current_page_is_dictionary, encryption::kDictionaryPage, data_pageAAD_); } -#endif // Read the compressed data page. std::shared_ptr page_buffer; PARQUET_THROW_NOT_OK(stream_->Read(compressed_len, &page_buffer)); @@ -331,7 +312,6 @@ std::shared_ptr SerializedPageReader::NextPage() { } // Decrypt it if we need to -#ifdef PARQUET_ENCRYPTION if (data_decryptor_ != nullptr) { PARQUET_THROW_NOT_OK(decryption_buffer_->Resize( compressed_len - data_decryptor_->CiphertextSizeDelta())); @@ -340,7 +320,6 @@ std::shared_ptr SerializedPageReader::NextPage() { page_buffer = decryption_buffer_; } -#endif // PARQUET_ENCRYPTION // Uncompress it if we need to if (decompressor_ != nullptr) { // Grow the uncompressed buffer if we need to. diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 21a937428581..6c687776e973 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -38,6 +38,8 @@ #include "parquet/column_page.h" #include "parquet/encoding.h" +#include "parquet/encryption_internal.h" +#include "parquet/internal_file_encryptor.h" #include "parquet/metadata.h" #include "parquet/platform.h" #include "parquet/properties.h" @@ -46,11 +48,6 @@ #include "parquet/thrift.h" #include "parquet/types.h" -#ifdef PARQUET_ENCRYPTION -#include "parquet/encryption_internal.h" -#include "parquet/internal_file_encryptor.h" -#endif - namespace parquet { using arrow::Status; @@ -161,9 +158,7 @@ class SerializedPageWriter : public PageWriter { meta_encryptor_(meta_encryptor), data_encryptor_(data_encryptor) { if (data_encryptor_ != NULLPTR || meta_encryptor_ != NULLPTR) { -#ifdef PARQUET_ENCRYPTION InitEncryption(); -#endif } compressor_ = GetCodecFromArrow(codec); thrift_serializer_.reset(new ThriftSerializer); 
@@ -189,7 +184,6 @@ class SerializedPageWriter : public PageWriter { const uint8_t* output_data_buffer = compressed_data->data(); int32_t output_data_len = static_cast(compressed_data->size()); -#ifdef PARQUET_ENCRYPTION std::shared_ptr encrypted_data_buffer = nullptr; if (data_encryptor_.get()) { UpdateEncryption(encryption::kDictionaryPage); @@ -199,7 +193,6 @@ class SerializedPageWriter : public PageWriter { encrypted_data_buffer->mutable_data()); output_data_buffer = encrypted_data_buffer->data(); } -#endif format::PageHeader page_header; page_header.__set_type(format::PageType::DICTIONARY_PAGE); @@ -214,11 +207,9 @@ class SerializedPageWriter : public PageWriter { dictionary_page_offset_ = start_pos; } -#ifdef PARQUET_ENCRYPTION if (meta_encryptor_) { UpdateEncryption(encryption::kDictionaryPageHeader); } -#endif int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_); @@ -233,11 +224,9 @@ class SerializedPageWriter : public PageWriter { } void Close(bool has_dictionary, bool fallback) override { -#ifdef PARQUET_ENCRYPTION if (meta_encryptor_ != nullptr) { UpdateEncryption(encryption::kColumnMetaData); } -#endif // index_page_offset = -1 since they are not supported metadata_->Finish(num_values_, dictionary_page_offset_, -1, data_page_offset_, total_compressed_size_, total_uncompressed_size_, has_dictionary, @@ -282,7 +271,6 @@ class SerializedPageWriter : public PageWriter { const uint8_t* output_data_buffer = compressed_data->data(); int32_t output_data_len = static_cast(compressed_data->size()); -#ifdef PARQUET_ENCRYPTION std::shared_ptr encrypted_data_buffer = AllocateBuffer(pool_, 0); if (data_encryptor_.get()) { UpdateEncryption(encryption::kDataPage); @@ -292,7 +280,6 @@ class SerializedPageWriter : public PageWriter { encrypted_data_buffer->mutable_data()); output_data_buffer = encrypted_data_buffer->data(); } -#endif format::PageHeader page_header; page_header.__set_type(format::PageType::DATA_PAGE); @@ -307,11 
+294,9 @@ class SerializedPageWriter : public PageWriter { data_page_offset_ = start_pos; } -#ifdef PARQUET_ENCRYPTION if (meta_encryptor_) { UpdateEncryption(encryption::kDataPageHeader); } -#endif int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_); PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len)); @@ -339,7 +324,6 @@ class SerializedPageWriter : public PageWriter { int64_t total_uncompressed_size() { return total_uncompressed_size_; } private: -#ifdef PARQUET_ENCRYPTION void InitEncryption() { // Prepare the AAD for quick update later. if (data_encryptor_ != NULLPTR) { @@ -388,7 +372,6 @@ class SerializedPageWriter : public PageWriter { throw ParquetException("Unknown module type in UpdateEncryption"); } } -#endif std::shared_ptr sink_; ColumnChunkMetaDataBuilder* metadata_; @@ -407,10 +390,8 @@ class SerializedPageWriter : public PageWriter { // Compression codec to use. std::unique_ptr compressor_; -#ifdef PARQUET_ENCRYPTION std::string data_pageAAD_; std::string data_page_headerAAD_; -#endif std::shared_ptr meta_encryptor_; std::shared_ptr data_encryptor_; diff --git a/cpp/src/parquet/encryption.cc b/cpp/src/parquet/encryption.cc index 69da7eeae562..fd94b34a7087 100644 --- a/cpp/src/parquet/encryption.cc +++ b/cpp/src/parquet/encryption.cc @@ -21,11 +21,11 @@ #include #include -#include - #include "arrow/util/logging.h" #include "arrow/util/utf8.h" +#include "parquet/encryption_internal.h" + namespace parquet { // integer key retriever @@ -389,7 +389,7 @@ FileEncryptionProperties::FileEncryptionProperties( uint8_t aad_file_unique[kAadFileUniqueLength]; memset(aad_file_unique, 0, kAadFileUniqueLength); - RAND_bytes(aad_file_unique, sizeof(kAadFileUniqueLength)); + encryption::RandBytes(aad_file_unique, sizeof(kAadFileUniqueLength)); std::string aad_file_unique_str(reinterpret_cast(aad_file_unique), kAadFileUniqueLength); diff --git a/cpp/src/parquet/encryption_internal-nossl.cc 
b/cpp/src/parquet/encryption_internal-nossl.cc new file mode 100644 index 000000000000..d06b1bf4ca12 --- /dev/null +++ b/cpp/src/parquet/encryption_internal-nossl.cc @@ -0,0 +1,108 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "parquet/encryption_internal.h" +#include "parquet/exception.h" + +namespace parquet { +namespace encryption { + +void ThrowOpenSSLRequiredException() { + throw ParquetException("Encrypted parquet file. 
OpenSSL is required"); +} + +class AesEncryptor::AesEncryptorImpl {}; + +AesEncryptor::~AesEncryptor() {} + +int AesEncryptor::SignedFooterEncrypt(const uint8_t* footer, int footer_len, uint8_t* key, + int key_len, uint8_t* aad, int aad_len, + uint8_t* nonce, uint8_t* encrypted_footer) { + ThrowOpenSSLRequiredException(); + return -1; +} + +void AesEncryptor::WipeOut() { ThrowOpenSSLRequiredException(); } + +int AesEncryptor::CiphertextSizeDelta() { + ThrowOpenSSLRequiredException(); + return -1; +} + +int AesEncryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* key, + int key_len, uint8_t* aad, int aad_len, uint8_t* ciphertext) { + ThrowOpenSSLRequiredException(); + return -1; +} + +AesEncryptor::AesEncryptor(ParquetCipher::type alg_id, int key_len, bool metadata) { + ThrowOpenSSLRequiredException(); +} + +class AesDecryptor::AesDecryptorImpl {}; + +int AesDecryptor::Decrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* key, + int key_len, uint8_t* aad, int aad_len, uint8_t* ciphertext) { + ThrowOpenSSLRequiredException(); + return -1; +} + +void AesDecryptor::WipeOut() { ThrowOpenSSLRequiredException(); } + +AesDecryptor::~AesDecryptor() {} + +AesEncryptor* AesEncryptor::Make(ParquetCipher::type alg_id, int key_len, bool metadata, + std::vector* all_encryptors) { + return NULLPTR; +} + +AesDecryptor::AesDecryptor(ParquetCipher::type alg_id, int key_len, bool metadata) { + ThrowOpenSSLRequiredException(); +} + +AesDecryptor* AesDecryptor::Make(ParquetCipher::type alg_id, int key_len, bool metadata, + std::vector* all_decryptors) { + return NULLPTR; +} + +int AesDecryptor::CiphertextSizeDelta() { + ThrowOpenSSLRequiredException(); + return -1; +} + +std::string CreateModuleAad(const std::string& file_aad, int8_t module_type, + int16_t row_group_ordinal, int16_t column_ordinal, + int16_t page_ordinal) { + ThrowOpenSSLRequiredException(); + return ""; +} + +std::string CreateFooterAad(const std::string& aad_prefix_bytes) { + 
ThrowOpenSSLRequiredException(); + return ""; +} + +// Update last two bytes with new page ordinal (instead of creating new page AAD +// from scratch) +void QuickUpdatePageAad(const std::string& AAD, int16_t new_page_ordinal) { + ThrowOpenSSLRequiredException(); +} + +void RandBytes(unsigned char* buf, int num) { ThrowOpenSSLRequiredException(); } + +} // namespace encryption +} // namespace parquet diff --git a/cpp/src/parquet/encryption_internal.cc b/cpp/src/parquet/encryption_internal.cc index 695b9b5db723..c2c3f200ab46 100644 --- a/cpp/src/parquet/encryption_internal.cc +++ b/cpp/src/parquet/encryption_internal.cc @@ -602,5 +602,7 @@ void QuickUpdatePageAad(const std::string& AAD, int16_t new_page_ordinal) { reinterpret_cast(page_ordinal_bytes.c_str()), 2); } +void RandBytes(unsigned char* buf, int num) { RAND_bytes(buf, num); } + } // namespace encryption } // namespace parquet diff --git a/cpp/src/parquet/encryption_internal.h b/cpp/src/parquet/encryption_internal.h index 9fe82bd28da8..ec30cd5dd902 100644 --- a/cpp/src/parquet/encryption_internal.h +++ b/cpp/src/parquet/encryption_internal.h @@ -108,6 +108,8 @@ std::string CreateFooterAad(const std::string& aad_prefix_bytes); // Update last two bytes of page (or page header) module AAD void QuickUpdatePageAad(const std::string& AAD, int16_t new_page_ordinal); +void RandBytes(unsigned char* buf, int num); + } // namespace encryption } // namespace parquet diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 8e681dc42ad5..61813e5861ae 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -32,23 +32,16 @@ #include "parquet/column_reader.h" #include "parquet/column_scanner.h" #include "parquet/deprecated_io.h" +#include "parquet/encryption_internal.h" #include "parquet/exception.h" #include "parquet/file_writer.h" +#include "parquet/internal_file_decryptor.h" #include "parquet/metadata.h" #include "parquet/platform.h" #include "parquet/properties.h" 
#include "parquet/schema.h" #include "parquet/types.h" -#ifdef PARQUET_ENCRYPTION -#include "parquet/encryption_internal.h" -#include "parquet/internal_file_decryptor.h" -#else -namespace parquet { -class InternalFileDecryptor; -} -#endif - namespace parquet { // PARQUET-978: Minimize footer reads by reading 64 KB from the end of the file @@ -144,7 +137,6 @@ class SerializedRowGroup : public RowGroupReader::Contents { properties_.memory_pool(), &ctx); } -#ifdef PARQUET_ENCRYPTION // The column is encrypted std::shared_ptr meta_decryptor; @@ -171,7 +163,6 @@ class SerializedRowGroup : public RowGroupReader::Contents { ctx = PageReaderContext{col->has_dictionary_page(), row_group_ordinal_, static_cast(i), meta_decryptor, data_decryptor}; -#endif // PARQUET_ENCRYPTION return PageReader::Open(stream, col->num_values(), col->compression(), properties_.memory_pool(), &ctx); } @@ -205,18 +196,12 @@ class SerializedFile : public ParquetFileReader::Contents { } void Close() override { -#ifdef PARQUET_ENCRYPTION if (file_decryptor_) file_decryptor_->WipeOutDecryptionKeys(); -#endif } std::shared_ptr GetRowGroup(int i) override { - InternalFileDecryptor* file_decryptor = NULLPTR; -#ifdef PARQUET_ENCRYPTION - file_decryptor = file_decryptor_.get(); -#endif std::unique_ptr contents(new SerializedRowGroup( - source_, file_metadata_.get(), i, properties_, file_decryptor)); + source_, file_metadata_.get(), i, properties_, file_decryptor_.get())); return std::make_shared(std::move(contents)); } @@ -245,33 +230,18 @@ class SerializedFile : public ParquetFileReader::Contents { source_->ReadAt(file_size - footer_read_size, footer_read_size, &footer_buffer)); // Check if all bytes are read. 
Check if last 4 bytes read have the magic bits -#ifdef PARQUET_ENCRYPTION if (footer_buffer->size() != footer_read_size || (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) != 0 && memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) != 0)) { throw ParquetException("Invalid parquet file. Corrupt footer."); } -#else - if (footer_buffer->size() != footer_read_size || - (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) != 0 && - memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) != 0)) { - throw ParquetException("Invalid parquet file. Corrupt footer."); - } else if (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) == - 0) { - throw ParquetException( - "Encrypted parquet file. " - "Should build with parquet encryption support."); - } -#endif -#if PARQUET_ENCRYPTION if (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) == 0) { // Encrypted file with Encrypted footer. ParseMetaDataOfEncryptedFileWithEncryptedFooter(footer_buffer, footer_read_size, file_size); return; } -#endif // No encryption or encryption with plaintext footer mode. std::shared_ptr metadata_buffer; @@ -279,7 +249,6 @@ class SerializedFile : public ParquetFileReader::Contents { ParseUnencryptedFileMetadata(footer_buffer, footer_read_size, file_size, &metadata_buffer, &metadata_len, &read_metadata_len); -#ifdef PARQUET_ENCRYPTION auto file_decryption_properties = properties_.file_decryption_properties(); if (!file_metadata_->is_encryption_algorithm_set()) { // Non encrypted file. 
if (file_decryption_properties != NULLPTR) { @@ -292,7 +261,6 @@ class SerializedFile : public ParquetFileReader::Contents { ParseMetaDataOfEncryptedFileWithPlaintextFooter( file_decryption_properties, metadata_buffer, metadata_len, read_metadata_len); } -#endif } private: @@ -300,16 +268,13 @@ class SerializedFile : public ParquetFileReader::Contents { std::shared_ptr file_metadata_; ReaderProperties properties_; -#ifdef PARQUET_ENCRYPTION std::unique_ptr file_decryptor_; -#endif void ParseUnencryptedFileMetadata(const std::shared_ptr& footer_buffer, int64_t footer_read_size, int64_t file_size, std::shared_ptr* metadata_buffer, uint32_t* metadata_len, uint32_t* read_metadata_len); -#if PARQUET_ENCRYPTION std::string HandleAadPrefix(FileDecryptionProperties* file_decryption_properties, EncryptionAlgorithm& algo); @@ -321,7 +286,6 @@ class SerializedFile : public ParquetFileReader::Contents { void ParseMetaDataOfEncryptedFileWithEncryptedFooter( const std::shared_ptr& footer_buffer, int64_t footer_read_size, int64_t file_size); -#endif }; void SerializedFile::ParseUnencryptedFileMetadata( @@ -353,7 +317,6 @@ void SerializedFile::ParseUnencryptedFileMetadata( file_metadata_ = FileMetaData::Make((*metadata_buffer)->data(), read_metadata_len); } -#ifdef PARQUET_ENCRYPTION void SerializedFile::ParseMetaDataOfEncryptedFileWithEncryptedFooter( const std::shared_ptr& footer_buffer, int64_t footer_read_size, int64_t file_size) { @@ -481,7 +444,6 @@ std::string SerializedFile::HandleAadPrefix( } return aad_prefix + algo.aad.aad_file_unique; } -#endif // PARQUET_ENCRYPTION // ---------------------------------------------------------------------- // ParquetFileReader public API diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 721c40dcee38..90c721fa3e7e 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -24,19 +24,12 @@ #include "parquet/column_writer.h" #include "parquet/deprecated_io.h" +#include 
"parquet/encryption_internal.h" #include "parquet/exception.h" +#include "parquet/internal_file_encryptor.h" #include "parquet/platform.h" #include "parquet/schema.h" -#ifdef PARQUET_ENCRYPTION -#include "parquet/encryption_internal.h" -#include "parquet/internal_file_encryptor.h" -#else -namespace parquet { -class InternalFileEncryptor; -} -#endif - using arrow::MemoryPool; using parquet::schema::GroupNode; @@ -135,16 +128,12 @@ class RowGroupSerializer : public RowGroupWriter::Contents { const ColumnDescriptor* column_descr = col_meta->descr(); - std::shared_ptr meta_encryptor = NULLPTR; - std::shared_ptr data_encryptor = NULLPTR; -#ifdef PARQUET_ENCRYPTION - meta_encryptor = file_encryptor_ - ? file_encryptor_->GetColumnMetaEncryptor(column_descr->path()) - : NULLPTR; - data_encryptor = file_encryptor_ - ? file_encryptor_->GetColumnDataEncryptor(column_descr->path()) - : NULLPTR; -#endif + auto meta_encryptor = + file_encryptor_ ? file_encryptor_->GetColumnMetaEncryptor(column_descr->path()) + : NULLPTR; + auto data_encryptor = + file_encryptor_ ? file_encryptor_->GetColumnDataEncryptor(column_descr->path()) + : NULLPTR; std::unique_ptr pager = PageWriter::Open( sink_, properties_->compression(column_descr->path()), col_meta, row_group_ordinal_, static_cast(current_column_index_ - 1), @@ -246,16 +235,12 @@ class RowGroupSerializer : public RowGroupWriter::Contents { auto col_meta = metadata_->NextColumnChunk(); const ColumnDescriptor* column_descr = col_meta->descr(); - std::shared_ptr meta_encryptor = NULLPTR; - std::shared_ptr data_encryptor = NULLPTR; -#ifdef PARQUET_ENCRYPTION - meta_encryptor = file_encryptor_ - ? file_encryptor_->GetColumnMetaEncryptor(column_descr->path()) - : NULLPTR; - data_encryptor = file_encryptor_ - ? file_encryptor_->GetColumnDataEncryptor(column_descr->path()) - : NULLPTR; -#endif + auto meta_encryptor = + file_encryptor_ ? 
file_encryptor_->GetColumnMetaEncryptor(column_descr->path()) + : NULLPTR; + auto data_encryptor = + file_encryptor_ ? file_encryptor_->GetColumnDataEncryptor(column_descr->path()) + : NULLPTR; std::unique_ptr pager = PageWriter::Open( sink_, properties_->compression(column_descr->path()), col_meta, static_cast(row_group_ordinal_), @@ -306,9 +291,7 @@ class FileSerializer : public ParquetFileWriter::Contents { file_metadata_ = metadata_->Finish(); WriteFileMetaData(*file_metadata_, sink_.get()); } else { // Encrypted file -#ifdef PARQUET_ENCRYPTION CloseEncryptedFile(file_encryption_properties); -#endif } } } @@ -329,14 +312,9 @@ class FileSerializer : public ParquetFileWriter::Contents { } num_row_groups_++; auto rg_metadata = metadata_->AppendRowGroup(); - InternalFileEncryptor* file_encryptor = NULLPTR; - -#ifdef PARQUET_ENCRYPTION - file_encryptor = file_encryptor_.get(); -#endif std::unique_ptr contents(new RowGroupSerializer( sink_, rg_metadata, static_cast(num_row_groups_ - 1), properties_.get(), - buffered_row_group, file_encryptor)); + buffered_row_group, file_encryptor_.get())); row_group_writer_.reset(new RowGroupWriter(std::move(contents))); return row_group_writer_.get(); } @@ -367,7 +345,6 @@ class FileSerializer : public ParquetFileWriter::Contents { StartFile(); } -#ifdef PARQUET_ENCRYPTION void CloseEncryptedFile(FileEncryptionProperties* file_encryption_properties) { // Encrypted file with encrypted footer if (file_encryption_properties->encrypted_footer()) { @@ -397,7 +374,6 @@ class FileSerializer : public ParquetFileWriter::Contents { file_encryptor_->WipeOutEncryptionKeys(); } } -#endif std::shared_ptr sink_; bool is_open_; @@ -408,9 +384,7 @@ class FileSerializer : public ParquetFileWriter::Contents { // Only one of the row group writers is active at a time std::unique_ptr row_group_writer_; -#ifdef PARQUET_ENCRYPTION std::unique_ptr file_encryptor_; -#endif void StartFile() { auto file_encryption_properties = 
properties_->file_encryption_properties(); @@ -418,7 +392,6 @@ class FileSerializer : public ParquetFileWriter::Contents { // Unencrypted parquet files always start with PAR1 PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4)); } else { -#ifdef PARQUET_ENCRYPTION file_encryptor_.reset(new InternalFileEncryptor(file_encryption_properties, properties_->memory_pool())); if (file_encryption_properties->encrypted_footer()) { @@ -427,7 +400,6 @@ class FileSerializer : public ParquetFileWriter::Contents { // Encrypted file with plaintext footer mode. PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4)); } -#endif } } }; @@ -484,7 +456,6 @@ void WriteMetaDataFile(const FileMetaData& file_metadata, ArrowOutputStream* sin return WriteFileMetaData(file_metadata, sink); } -#ifdef PARQUET_ENCRYPTION void WriteEncryptedFileMetadata(const FileMetaData& file_metadata, ArrowOutputStream* sink, const std::shared_ptr& encryptor, @@ -528,7 +499,6 @@ void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, ParquetOutputWrapper wrapper(sink); crypto_metadata.WriteTo(&wrapper); } -#endif // PARQUET_ENCRYPTION const SchemaDescriptor* ParquetFileWriter::schema() const { return contents_->schema(); } diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h index 1c97ed5c1902..693eba1e8986 100644 --- a/cpp/src/parquet/file_writer.h +++ b/cpp/src/parquet/file_writer.h @@ -108,7 +108,6 @@ PARQUET_EXPORT void WriteMetaDataFile(const FileMetaData& file_metadata, ::arrow::io::OutputStream* sink); -#ifdef PARQUET_ENCRYPTION PARQUET_EXPORT void WriteEncryptedFileMetadata(const FileMetaData& file_metadata, ArrowOutputStream* sink, @@ -125,7 +124,6 @@ void WriteEncryptedFileMetadata(const FileMetaData& file_metadata, PARQUET_EXPORT void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata, ::arrow::io::OutputStream* sink); -#endif class PARQUET_EXPORT ParquetFileWriter { public: diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 
298760c80e09..1f9e456ee76f 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -25,7 +25,9 @@ #include #include // IWYU pragma: keep +#include "parquet/encryption_internal.h" #include "parquet/exception.h" +#include "parquet/internal_file_decryptor.h" #include "parquet/metadata.h" #include "parquet/schema.h" #include "parquet/schema_internal.h" @@ -45,10 +47,6 @@ using ::std::regex_match; using ::std::smatch; #endif -#ifdef PARQUET_ENCRYPTION -#include "parquet/internal_file_decryptor.h" -#endif - namespace parquet { const ApplicationVersion& ApplicationVersion::PARQUET_251_FIXED_VERSION() { @@ -185,7 +183,6 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { InternalFileDecryptor* file_decryptor = NULLPTR) : column_(column), descr_(descr), writer_version_(writer_version) { if (column->__isset.crypto_metadata) { // column metadata is encrypted -#ifdef PARQUET_ENCRYPTION format::ColumnCryptoMetaData ccmd = column->crypto_metadata; if (ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY) { @@ -210,7 +207,6 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { } else { is_metadata_set_ = true; } -#endif // PARQUET_ENCRYPTION } else { // column metadata is not encrypted is_metadata_set_ = true; } @@ -501,7 +497,6 @@ class FileMetaData::FileMetaDataImpl { InitKeyValueMetadata(); } -#ifdef PARQUET_ENCRYPTION bool VerifySignature(InternalFileDecryptor* file_decryptor, const void* signature) { // serialize the footer uint8_t* serialized_data; @@ -532,7 +527,6 @@ class FileMetaData::FileMetaDataImpl { memcmp(encrypted_buffer.data() + encrypted_len - encryption::kGcmTagLength, tag, encryption::kGcmTagLength); } -#endif inline uint32_t size() const { return metadata_len_; } inline int num_columns() const { return schema_.num_columns(); } @@ -564,7 +558,6 @@ class FileMetaData::FileMetaDataImpl { // Only in encrypted files with plaintext footers the // encryption_algorithm is set in footer if (is_encryption_algorithm_set()) { -#ifdef PARQUET_ENCRYPTION 
uint8_t* serialized_data; uint32_t serialized_len; serializer.SerializeToBuffer(metadata_.get(), &serialized_len, &serialized_data); @@ -583,7 +576,6 @@ class FileMetaData::FileMetaDataImpl { PARQUET_THROW_NOT_OK( dst->Write(encrypted_data.data() + encrypted_len - encryption::kGcmTagLength, encryption::kGcmTagLength)); -#endif // PARQUET_ENCRYPTION } else { // either plaintext file (when encryptor is null) // or encrypted file with encrypted footer serializer.Serialize(metadata_.get(), dst, encryptor); @@ -693,12 +685,10 @@ std::unique_ptr FileMetaData::RowGroup(int i) const { return impl_->RowGroup(i); } -#ifdef PARQUET_ENCRYPTION bool FileMetaData::VerifySignature(InternalFileDecryptor* file_decryptor, const void* signature) { return impl_->VerifySignature(file_decryptor, signature); } -#endif uint32_t FileMetaData::size() const { return impl_->size(); } @@ -982,7 +972,6 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } column_chunk_->meta_data.__set_encodings(thrift_encodings); -#ifdef PARQUET_ENCRYPTION const auto& encrypt_md = properties_->column_encryption_properties(column_->path()); // column is encrypted if (encrypt_md != NULLPTR && encrypt_md->is_encrypted()) { @@ -1036,7 +1025,6 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } } } -#endif // PARQUET_ENCRYPTION } void WriteTo(::arrow::io::OutputStream* sink) { @@ -1255,12 +1243,10 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { const std::shared_ptr& key_value_metadata) : properties_(props), schema_(schema), key_value_metadata_(key_value_metadata) { metadata_.reset(new format::FileMetaData()); -#ifdef PARQUET_ENCRYPTION if (props->file_encryption_properties() != nullptr && props->file_encryption_properties()->encrypted_footer()) { crypto_metadata_.reset(new format::FileCryptoMetaData()); } -#endif } RowGroupMetaDataBuilder* AppendRowGroup() { @@ -1316,7 +1302,6 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { 
metadata_->column_orders.resize(schema_->num_columns(), column_order); metadata_->__isset.column_orders = true; -#ifdef PARQUET_ENCRYPTION // if plaintext footer, set footer signing algorithm auto file_encryption_properties = properties_->file_encryption_properties(); if (file_encryption_properties && !file_encryption_properties->encrypted_footer()) { @@ -1335,7 +1320,6 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { metadata_->__set_footer_signing_key_metadata(footer_signing_key_metadata); } } -#endif // PARQUET_ENCRYPTION parquet::schema::SchemaFlattener flattener( static_cast(schema_->schema_root().get()), @@ -1347,7 +1331,6 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { return file_meta_data; } -#ifdef PARQUET_ENCRYPTION std::unique_ptr BuildFileCryptoMetaData() { if (crypto_metadata_ == nullptr) { return nullptr; @@ -1369,13 +1352,10 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { return file_crypto_metadata; } -#endif protected: std::unique_ptr metadata_; -#ifdef PARQUET_ENCRYPTION std::unique_ptr crypto_metadata_; -#endif private: const std::shared_ptr properties_; @@ -1407,10 +1387,8 @@ RowGroupMetaDataBuilder* FileMetaDataBuilder::AppendRowGroup() { std::unique_ptr FileMetaDataBuilder::Finish() { return impl_->Finish(); } -#ifdef PARQUET_ENCRYPTION std::unique_ptr FileMetaDataBuilder::GetCryptoMetaData() { return impl_->BuildFileCryptoMetaData(); } -#endif } // namespace parquet diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 824f7bb48e88..2095ac679e3c 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -204,12 +204,11 @@ class PARQUET_EXPORT FileMetaData { const std::shared_ptr& decryptor = NULLPTR); ~FileMetaData(); -#ifdef PARQUET_ENCRYPTION + /// Verify signature of FileMetadata when file is encrypted but footer is not encrypted /// (plaintext footer). 
/// Signature is 28 bytes (12 byte nonce and 16 byte tags) when encrypting FileMetadata bool VerifySignature(InternalFileDecryptor* file_decryptor, const void* signature); -#endif // file metadata uint32_t size() const; @@ -365,10 +364,8 @@ class PARQUET_EXPORT FileMetaDataBuilder { // Complete the Thrift structure std::unique_ptr Finish(); -#ifdef PARQUET_ENCRYPTION // crypto metadata std::unique_ptr GetCryptoMetaData(); -#endif private: explicit FileMetaDataBuilder( diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 20c8302217ce..74bb39885a4a 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -26,16 +26,7 @@ #include "arrow/type.h" -#ifdef PARQUET_ENCRYPTION #include "parquet/encryption.h" -#else -namespace parquet { -class FileEncryptionProperties; -class FileDecryptionProperties; -class ColumnEncryptionProperties; -} // namespace parquet -#endif - #include "parquet/exception.h" #include "parquet/parquet_version.h" #include "parquet/platform.h" @@ -74,7 +65,6 @@ class PARQUET_EXPORT ReaderProperties { int64_t buffer_size() const { return buffer_size_; } -#ifdef PARQUET_ENCRYPTION void file_decryption_properties( const std::shared_ptr& decryption) { file_decryption_properties_ = decryption; @@ -83,7 +73,6 @@ class PARQUET_EXPORT ReaderProperties { FileDecryptionProperties* file_decryption_properties() { return file_decryption_properties_.get(); } -#endif private: MemoryPool* pool_; @@ -294,13 +283,11 @@ class PARQUET_EXPORT WriterProperties { return this->compression(path->ToDotString(), codec); } -#ifdef PARQUET_ENCRYPTION Builder* encryption( const std::shared_ptr& file_encryption_properties) { file_encryption_properties_ = file_encryption_properties; return this; } -#endif Builder* enable_statistics() { default_column_properties_.set_statistics_enabled(true); @@ -433,7 +420,6 @@ class PARQUET_EXPORT WriterProperties { return file_encryption_properties_.get(); } -#ifdef PARQUET_ENCRYPTION std::shared_ptr 
column_encryption_properties( const std::shared_ptr& path) const { if (file_encryption_properties_) { @@ -442,7 +428,6 @@ class PARQUET_EXPORT WriterProperties { return NULLPTR; } } -#endif private: explicit WriterProperties( diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 108fc1d1f34b..29565b5a8745 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -44,22 +44,14 @@ #include "arrow/util/logging.h" #include "parquet/exception.h" +#include "parquet/internal_file_decryptor.h" +#include "parquet/internal_file_encryptor.h" #include "parquet/platform.h" #include "parquet/statistics.h" #include "parquet/types.h" #include "parquet/parquet_types.h" // IYWU pragma: export -#ifdef PARQUET_ENCRYPTION -#include "parquet/encryption_internal.h" -#include "parquet/internal_file_decryptor.h" -#include "parquet/internal_file_encryptor.h" -#else -namespace parquet { -class Encryptor; -class Decryptor; -} // namespace parquet -#endif namespace parquet { // Check if thrift version < 0.11.0 @@ -236,7 +228,6 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali if (decryptor == NULLPTR) { DeserializeThriftUnencryptedMsg(buf, len, deserialized_msg); } else { // thrift message is encrypted -#ifdef PARQUET_ENCRYPTION uint32_t clen; clen = *len; // decrypt @@ -253,7 +244,6 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali *len = decrypted_buffer_len + decryptor->CiphertextSizeDelta(); DeserializeThriftMsg(decrypted_buffer->data(), &decrypted_buffer_len, deserialized_msg); -#endif // PARQUET_ENCRYPTION } } @@ -316,17 +306,14 @@ class ThriftSerializer { int64_t SerializeEncryptedObj(ArrowOutputStream* out, uint8_t* out_buffer, uint32_t out_length, const std::shared_ptr& encryptor) { - int cipher_buffer_len = 0; -#ifdef PARQUET_ENCRYPTION std::shared_ptr cipher_buffer = std::static_pointer_cast(AllocateBuffer( encryptor->pool(), static_cast(encryptor->CiphertextSizeDelta() + 
out_length))); - cipher_buffer_len = + int cipher_buffer_len = encryptor->Encrypt(out_buffer, out_length, cipher_buffer->mutable_data()); PARQUET_THROW_NOT_OK(out->Write(cipher_buffer->data(), cipher_buffer_len)); -#endif return static_cast(cipher_buffer_len); } From 91a3197bb1c0b43781db6e2bd1f1a589e9ece8d0 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Tue, 16 Jul 2019 22:19:47 +0700 Subject: [PATCH 129/201] remove PARQUET_ENCRYPTION defines from CMakeLists.txt --- cpp/examples/parquet/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/examples/parquet/CMakeLists.txt b/cpp/examples/parquet/CMakeLists.txt index 813484641fe4..ed1ae537ffd9 100644 --- a/cpp/examples/parquet/CMakeLists.txt +++ b/cpp/examples/parquet/CMakeLists.txt @@ -23,7 +23,6 @@ target_link_libraries(parquet-low-level-example parquet_static) target_link_libraries(parquet-low-level-example2 parquet_static) if (ARROW_USE_OPENSSL) - add_definitions(-DPARQUET_ENCRYPTION) add_executable(parquet-encryption-example low-level-api/encryption-reader-writer.cc) add_executable(parquet-encryption-example-all-crypto-options low-level-api/encryption-reader-writer-all-crypto-options.cc) target_include_directories(parquet-encryption-example PRIVATE low-level-api/) From b93d791296402f4ff719526ef12737eb422a73a9 Mon Sep 17 00:00:00 2001 From: Ha Thi Tham Date: Wed, 17 Jul 2019 12:26:16 +0700 Subject: [PATCH 130/201] fix comments for encryption_internal_nossl.cc --- cpp/src/parquet/CMakeLists.txt | 2 +- ...ryption_internal-nossl.cc => encryption_internal_nossl.cc} | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) rename cpp/src/parquet/{encryption_internal-nossl.cc => encryption_internal_nossl.cc} (95%) diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 7e540287824d..0ea688c65ffb 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -190,7 +190,7 @@ if(ARROW_USE_OPENSSL) else() set(PARQUET_SRCS ${PARQUET_SRCS} - 
encryption_internal-nossl.cc) + encryption_internal_nossl.cc) endif() # Ensure that thrift compilation is done before using its generated headers diff --git a/cpp/src/parquet/encryption_internal-nossl.cc b/cpp/src/parquet/encryption_internal_nossl.cc similarity index 95% rename from cpp/src/parquet/encryption_internal-nossl.cc rename to cpp/src/parquet/encryption_internal_nossl.cc index d06b1bf4ca12..95c924525adc 100644 --- a/cpp/src/parquet/encryption_internal-nossl.cc +++ b/cpp/src/parquet/encryption_internal_nossl.cc @@ -22,7 +22,7 @@ namespace parquet { namespace encryption { void ThrowOpenSSLRequiredException() { - throw ParquetException("Encrypted parquet file. OpenSSL is required"); + throw ParquetException("Calling encryption method in Arrow/Parquet built without OpenSSL"); } class AesEncryptor::AesEncryptorImpl {}; @@ -96,8 +96,6 @@ std::string CreateFooterAad(const std::string& aad_prefix_bytes) { return ""; } -// Update last two bytes with new page ordinal (instead of creating new page AAD -// from scratch) void QuickUpdatePageAad(const std::string& AAD, int16_t new_page_ordinal) { ThrowOpenSSLRequiredException(); } From da1acf178e3a71f139b79dc8705339c9b2e3672e Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Wed, 17 Jul 2019 08:59:20 +0300 Subject: [PATCH 131/201] Format fixes and check that all columns in columnEncryptionProperties exist in the schema. 
--- cpp/src/parquet/CMakeLists.txt | 8 ++----- cpp/src/parquet/encryption.h | 2 ++ cpp/src/parquet/encryption_internal.cc | 4 ++-- cpp/src/parquet/encryption_internal_nossl.cc | 3 ++- cpp/src/parquet/file_writer.cc | 23 ++++++++++++++++++++ cpp/src/parquet/internal_file_decryptor.h | 2 ++ cpp/src/parquet/metadata.cc | 10 +++++---- 7 files changed, 39 insertions(+), 13 deletions(-) diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 0ea688c65ffb..b613b999f4dc 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -184,13 +184,9 @@ set(PARQUET_SRCS types.cc) if(ARROW_USE_OPENSSL) - set(PARQUET_SRCS - ${PARQUET_SRCS} - encryption_internal.cc) + set(PARQUET_SRCS ${PARQUET_SRCS} encryption_internal.cc) else() - set(PARQUET_SRCS - ${PARQUET_SRCS} - encryption_internal_nossl.cc) + set(PARQUET_SRCS ${PARQUET_SRCS} encryption_internal_nossl.cc) endif() # Ensure that thrift compilation is done before using its generated headers diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 4958faac2aec..23ef5a362b41 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -491,6 +491,8 @@ class PARQUET_EXPORT FileEncryptionProperties { /// aadPrefix will be cloned too) std::shared_ptr DeepClone(std::string new_aad_prefix = ""); + ColumnPathToEncryptionPropertiesMap column_properties() { return column_properties_; } + private: EncryptionAlgorithm algorithm_; std::string footer_key_; diff --git a/cpp/src/parquet/encryption_internal.cc b/cpp/src/parquet/encryption_internal.cc index c2c3f200ab46..77a32fc10737 100644 --- a/cpp/src/parquet/encryption_internal.cc +++ b/cpp/src/parquet/encryption_internal.cc @@ -277,7 +277,7 @@ int AesEncryptor::AesEncryptorImpl::CtrEncrypt(const uint8_t* plaintext, return kBufferSizeLength + buffer_size; } -AesEncryptor::~AesEncryptor() { impl_->~AesEncryptorImpl(); } +AesEncryptor::~AesEncryptor() {} int AesEncryptor::SignedFooterEncrypt(const 
uint8_t* footer, int footer_len, uint8_t* key, int key_len, uint8_t* aad, int aad_len, @@ -341,7 +341,7 @@ int AesDecryptor::Decrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* void AesDecryptor::WipeOut() { impl_->WipeOut(); } -AesDecryptor::~AesDecryptor() { impl_->~AesDecryptorImpl(); } +AesDecryptor::~AesDecryptor() {} AesDecryptor::AesDecryptorImpl::AesDecryptorImpl(ParquetCipher::type alg_id, int key_len, bool metadata) { diff --git a/cpp/src/parquet/encryption_internal_nossl.cc b/cpp/src/parquet/encryption_internal_nossl.cc index 95c924525adc..0a33f8509233 100644 --- a/cpp/src/parquet/encryption_internal_nossl.cc +++ b/cpp/src/parquet/encryption_internal_nossl.cc @@ -22,7 +22,8 @@ namespace parquet { namespace encryption { void ThrowOpenSSLRequiredException() { - throw ParquetException("Calling encryption method in Arrow/Parquet built without OpenSSL"); + throw ParquetException( + "Calling encryption method in Arrow/Parquet built without OpenSSL"); } class AesEncryptor::AesEncryptorImpl {}; diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 90c721fa3e7e..8955acd174e1 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -392,6 +392,29 @@ class FileSerializer : public ParquetFileWriter::Contents { // Unencrypted parquet files always start with PAR1 PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4)); } else { + // Check that all columns in columnEncryptionProperties exist in the schema. + auto columnEncryptionProperties = file_encryption_properties->column_properties(); + // if columnEncryptionProperties is empty, every column in file schema will be + // encrypted with footer key. + if (columnEncryptionProperties.size() != 0) { + std::vector> column_path_vec; + // First, save all column paths in schema. + for (int i = 0; i < num_columns(); i++) + column_path_vec.push_back(schema_.Column(i)->path()); + // Check if column exists in schema. 
+ for (const auto& elem : columnEncryptionProperties) { + auto it = std::find_if(column_path_vec.begin(), column_path_vec.end(), + [&](std::shared_ptr const& p) { + return (p->ToDotString() == elem.first->ToDotString()); + }); + if (it == column_path_vec.end()) { + std::stringstream ss; + ss << "Encrypted column " + elem.first->ToDotString() + " not in file schema"; + throw ParquetException(ss.str()); + } + } + } + file_encryptor_.reset(new InternalFileEncryptor(file_encryption_properties, properties_->memory_pool())); if (file_encryption_properties->encrypted_footer()) { diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h index 62812888f721..942f156feb85 100644 --- a/cpp/src/parquet/internal_file_decryptor.h +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -75,6 +75,8 @@ class InternalFileDecryptor { void WipeOutDecryptionKeys(); + ::arrow::MemoryPool* pool() { return pool_; } + std::shared_ptr GetFooterDecryptor(); std::shared_ptr GetFooterDecryptorForColumnMeta(const std::string& aad = ""); std::shared_ptr GetFooterDecryptorForColumnData(const std::string& aad = ""); diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 1f9e456ee76f..ed8b205e7c28 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -515,16 +515,18 @@ class FileMetaData::FileMetaDataImpl { auto aes_encryptor = encryption::AesEncryptor::Make( file_decryptor->algorithm(), static_cast(key.size()), true, NULLPTR); - std::vector encrypted_buffer(aes_encryptor->CiphertextSizeDelta() + - serialized_len); + std::shared_ptr encrypted_buffer = std::static_pointer_cast( + AllocateBuffer(file_decryptor->pool(), + aes_encryptor->CiphertextSizeDelta() + serialized_len)); uint32_t encrypted_len = aes_encryptor->SignedFooterEncrypt( serialized_data, serialized_len, str2bytes(key), static_cast(key.size()), - str2bytes(aad), static_cast(aad.size()), nonce, encrypted_buffer.data()); + str2bytes(aad), 
static_cast(aad.size()), nonce, + encrypted_buffer->mutable_data()); // Delete AES encryptor object. It was created only to verify the footer signature. aes_encryptor->WipeOut(); delete aes_encryptor; return 0 == - memcmp(encrypted_buffer.data() + encrypted_len - encryption::kGcmTagLength, + memcmp(encrypted_buffer->data() + encrypted_len - encryption::kGcmTagLength, tag, encryption::kGcmTagLength); } From ab76ece4f69595ed9cf202326c226ab33234027c Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Thu, 18 Jul 2019 07:22:13 +0300 Subject: [PATCH 132/201] Add encryption tests --- cpp/src/parquet/CMakeLists.txt | 4 +- .../parquet/encryption-configurations-test.cc | 787 ------------------ .../encryption-read-configurations-test.cc | 454 ++++++++++ .../encryption-write-configurations-test.cc | 392 +++++++++ cpp/src/parquet/test-encryption-util.cc | 37 + cpp/src/parquet/test-encryption-util.h | 63 ++ 6 files changed, 949 insertions(+), 788 deletions(-) delete mode 100644 cpp/src/parquet/encryption-configurations-test.cc create mode 100644 cpp/src/parquet/encryption-read-configurations-test.cc create mode 100644 cpp/src/parquet/encryption-write-configurations-test.cc create mode 100644 cpp/src/parquet/test-encryption-util.cc create mode 100644 cpp/src/parquet/test-encryption-util.h diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index b613b999f4dc..246cf215ffbe 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -329,8 +329,10 @@ add_parquet_test(arrow-test if(ARROW_USE_OPENSSL) add_parquet_test(encryption-test SOURCES - encryption-configurations-test.cc + encryption-write-configurations-test.cc + encryption-read-configurations-test.cc encryption-properties-test.cc + test-encryption-util.cc test-util.cc) endif() diff --git a/cpp/src/parquet/encryption-configurations-test.cc b/cpp/src/parquet/encryption-configurations-test.cc deleted file mode 100644 index 63e7cccff57f..000000000000 --- 
a/cpp/src/parquet/encryption-configurations-test.cc +++ /dev/null @@ -1,787 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include - -#include - -#include - -#include "parquet/column_reader.h" -#include "parquet/column_writer.h" -#include "parquet/file_reader.h" -#include "parquet/file_writer.h" -#include "parquet/platform.h" -#include "parquet/test-util.h" - -/* - * This file contains unit-tests for writing and reading encrypted Parquet files with - * different encryption and decryption configurations. - * - * Each unit-test produces a single parquet file, encrypted with one of the encryption - * configuration described below; and is read multiple times using a set of decryption - * configurations, also described below. - * - * A detailed description of the Parquet Modular Encryption specification can be found - * here: - * https://github.com/apache/parquet-format/blob/encryption/Encryption.md - * - * Each unit-test creates a single parquet file with eight columns using one of the - * following encryption configurations: - * - * - Encryption configuration 1: Encrypt all columns and the footer with the same key. 
- * (uniform encryption) - * - Encryption configuration 2: Encrypt two columns and the footer, with different - * keys. - * - Encryption configuration 3: Encrypt two columns, with different keys. - * Don’t encrypt footer (to enable legacy readers) - * - plaintext footer mode. - * - Encryption configuration 4: Encrypt two columns and the footer, with different - * keys. Supply aad_prefix for file identity - * verification. - * - Encryption configuration 5: Encrypt two columns and the footer, with different - * keys. Supply aad_prefix, and call - * disable_aad_prefix_storage to prevent file - * identity storage in file metadata. - * - Encryption configuration 6: Encrypt two columns and the footer, with different - * keys. Use the alternative (AES_GCM_CTR_V1) algorithm. - * - * The written parquet file produced above is read by each of the following decryption - * configurations: - * - * - Decryption configuration 1: Decrypt using key retriever that holds the keys of - * two encrypted columns and the footer key. - * - Decryption configuration 2: Decrypt using key retriever that holds the keys of - * two encrypted columns and the footer key. Supplies - * aad_prefix to verify file identity. - * - Decryption configuration 3: Decrypt using explicit column and footer keys - * (instead of key retrieval callback). 
- */ - -namespace parquet { -namespace test { -std::string data_file(const char* file) { - std::string dir_string(test::get_data_dir()); - std::stringstream ss; - ss << dir_string << "/" << file; - return ss.str(); -} - -using FileClass = ::arrow::io::FileOutputStream; - -using parquet::ConvertedType; -using parquet::Repetition; -using parquet::Type; -using schema::GroupNode; -using schema::NodePtr; -using schema::PrimitiveNode; - -constexpr int kFixedLength = 10; - -const char kFooterEncryptionKey[] = "0123456789012345"; // 128bit/16 -const char kColumnEncryptionKey1[] = "1234567890123450"; -const char kColumnEncryptionKey2[] = "1234567890123451"; -const char kFileName[] = "tester"; - -class TestEncryptionConfiguration : public ::testing::Test { - public: - void SetUp() { - createDecryptionConfigurations(); - // Setup the parquet schema - schema_ = SetupEncryptionSchema(); - std::string res = "test.parquet.encrypted"; - file_name_ = data_file(res.c_str()); - } - - void TearDown() { - // delete test file. - ASSERT_EQ(std::remove(file_name_.c_str()), 0); - } - - protected: - std::shared_ptr path_to_double_field_ = - parquet::schema::ColumnPath::FromDotString("double_field"); - std::shared_ptr path_to_float_field_ = - parquet::schema::ColumnPath::FromDotString("float_field"); - std::string file_name_; - int rows_per_rowgroup_ = 50; - std::shared_ptr schema_; - // This vector will hold various decryption configurations. 
- std::vector> - vector_of_decryption_configurations_; - std::string kFooterEncryptionKey_ = std::string(kFooterEncryptionKey); - std::string kColumnEncryptionKey1_ = std::string(kColumnEncryptionKey1); - std::string kColumnEncryptionKey2_ = std::string(kColumnEncryptionKey2); - std::string kFileName_ = std::string(kFileName); - - void createDecryptionConfigurations() { - /********************************************************************************** - Creating a number of Decryption configurations - **********************************************************************************/ - - // Decryption configuration 1: Decrypt using key retriever callback that holds the - // keys of two encrypted columns and the footer key. - std::shared_ptr string_kr1 = - std::make_shared(); - string_kr1->PutKey("kf", kFooterEncryptionKey_); - string_kr1->PutKey("kc1", kColumnEncryptionKey1_); - string_kr1->PutKey("kc2", kColumnEncryptionKey2_); - std::shared_ptr kr1 = - std::static_pointer_cast(string_kr1); - - parquet::FileDecryptionProperties::Builder file_decryption_builder_1; - vector_of_decryption_configurations_.push_back( - file_decryption_builder_1.key_retriever(kr1)->build()); - - // Decryption configuration 2: Decrypt using key retriever callback that holds the - // keys of two encrypted columns and the footer key. Supply aad_prefix. - std::shared_ptr string_kr2 = - std::make_shared(); - string_kr2->PutKey("kf", kFooterEncryptionKey_); - string_kr2->PutKey("kc1", kColumnEncryptionKey1_); - string_kr2->PutKey("kc2", kColumnEncryptionKey2_); - std::shared_ptr kr2 = - std::static_pointer_cast(string_kr2); - - parquet::FileDecryptionProperties::Builder file_decryption_builder_2; - vector_of_decryption_configurations_.push_back( - file_decryption_builder_2.key_retriever(kr2)->aad_prefix(kFileName_)->build()); - - // Decryption configuration 3: Decrypt using explicit column and footer keys. Supply - // aad_prefix. 
- std::shared_ptr path_float_ptr = - parquet::schema::ColumnPath::FromDotString("float_field"); - std::shared_ptr path_double_ptr = - parquet::schema::ColumnPath::FromDotString("double_field"); - std::map, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath> - decryption_cols; - parquet::ColumnDecryptionProperties::Builder decryption_col_builder31( - path_double_ptr); - parquet::ColumnDecryptionProperties::Builder decryption_col_builder32(path_float_ptr); - - decryption_cols[path_double_ptr] = - decryption_col_builder31.key(kColumnEncryptionKey1_)->build(); - decryption_cols[path_float_ptr] = - decryption_col_builder32.key(kColumnEncryptionKey2_)->build(); - - parquet::FileDecryptionProperties::Builder file_decryption_builder_3; - vector_of_decryption_configurations_.push_back( - file_decryption_builder_3.footer_key(kFooterEncryptionKey_) - ->column_properties(decryption_cols) - ->build()); - } - - void EncryptFile( - std::shared_ptr encryption_configurations, - std::string file) { - std::shared_ptr out_file; - - WriterProperties::Builder prop_builder; - prop_builder.compression(parquet::Compression::SNAPPY); - prop_builder.encryption(encryption_configurations); - std::shared_ptr writer_properties = prop_builder.build(); - - PARQUET_THROW_NOT_OK(FileClass::Open(file, &out_file)); - // Create a ParquetFileWriter instance - std::shared_ptr file_writer = - parquet::ParquetFileWriter::Open(out_file, schema_, writer_properties); - - RowGroupWriter* row_group_writer; - row_group_writer = file_writer->AppendRowGroup(); - - // Write the Bool column - parquet::BoolWriter* bool_writer = - static_cast(row_group_writer->NextColumn()); - for (int i = 0; i < rows_per_rowgroup_; i++) { - bool value = ((i % 2) == 0) ? 
true : false; - bool_writer->WriteBatch(1, nullptr, nullptr, &value); - } - - // Write the Int32 column - parquet::Int32Writer* int32_writer = - static_cast(row_group_writer->NextColumn()); - for (int i = 0; i < rows_per_rowgroup_; i++) { - int32_t value = i; - int32_writer->WriteBatch(1, nullptr, nullptr, &value); - } - - // Write the Int64 column. Each row has repeats twice. - parquet::Int64Writer* int64_writer = - static_cast(row_group_writer->NextColumn()); - for (int i = 0; i < 2 * rows_per_rowgroup_; i++) { - int64_t value = i * 1000 * 1000; - value *= 1000 * 1000; - int16_t definition_level = 1; - int16_t repetition_level = 0; - if ((i % 2) == 0) { - repetition_level = 1; // start of a new record - } - int64_writer->WriteBatch(1, &definition_level, &repetition_level, &value); - } - - // Write the INT96 column. - parquet::Int96Writer* int96_writer = - static_cast(row_group_writer->NextColumn()); - for (int i = 0; i < rows_per_rowgroup_; i++) { - parquet::Int96 value; - value.value[0] = i; - value.value[1] = i + 1; - value.value[2] = i + 2; - int96_writer->WriteBatch(1, nullptr, nullptr, &value); - } - - // Write the Float column - parquet::FloatWriter* float_writer = - static_cast(row_group_writer->NextColumn()); - for (int i = 0; i < rows_per_rowgroup_; i++) { - float value = static_cast(i) * 1.1f; - float_writer->WriteBatch(1, nullptr, nullptr, &value); - } - - // Write the Double column - parquet::DoubleWriter* double_writer = - static_cast(row_group_writer->NextColumn()); - for (int i = 0; i < rows_per_rowgroup_; i++) { - double value = i * 1.1111111; - double_writer->WriteBatch(1, nullptr, nullptr, &value); - } - - // Write the ByteArray column. 
Make every alternate values NULL - parquet::ByteArrayWriter* ba_writer = - static_cast(row_group_writer->NextColumn()); - for (int i = 0; i < rows_per_rowgroup_; i++) { - parquet::ByteArray value; - char hello[kFixedLength] = "parquet"; - hello[7] = static_cast(static_cast('0') + i / 100); - hello[8] = static_cast(static_cast('0') + (i / 10) % 10); - hello[9] = static_cast(static_cast('0') + i % 10); - if (i % 2 == 0) { - int16_t definition_level = 1; - value.ptr = reinterpret_cast(&hello[0]); - value.len = kFixedLength; - ba_writer->WriteBatch(1, &definition_level, nullptr, &value); - } else { - int16_t definition_level = 0; - ba_writer->WriteBatch(1, &definition_level, nullptr, nullptr); - } - } - // Write the FixedLengthByteArray column - parquet::FixedLenByteArrayWriter* flba_writer = - static_cast(row_group_writer->NextColumn()); - for (int i = 0; i < rows_per_rowgroup_; i++) { - parquet::FixedLenByteArray value; - char v = static_cast(i); - char flba[kFixedLength] = {v, v, v, v, v, v, v, v, v, v}; - value.ptr = reinterpret_cast(&flba[0]); - - flba_writer->WriteBatch(1, nullptr, nullptr, &value); - } - // Close the ParquetFileWriter - file_writer->Close(); - - return; - } - - void DecryptFile(std::string file, int example_id, int encryption_configuration) { - std::string exception_msg; - std::unique_ptr file_reader; - try { - parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); - reader_properties.file_decryption_properties( - vector_of_decryption_configurations_[example_id]->DeepClone()); - - file_reader = parquet::ParquetFileReader::OpenFile(file, false, reader_properties); - - // Get the File MetaData - std::shared_ptr file_metadata = file_reader->metadata(); - - // Get the number of RowGroups - int num_row_groups = file_metadata->num_row_groups(); - ASSERT_EQ(num_row_groups, 1); - - // Get the number of Columns - int num_columns = file_metadata->num_columns(); - ASSERT_EQ(num_columns, 8); - - // Iterate over all the RowGroups 
in the file - for (int r = 0; r < num_row_groups; ++r) { - // Get the RowGroup Reader - std::shared_ptr row_group_reader = - file_reader->RowGroup(r); - - int64_t values_read = 0; - int64_t rows_read = 0; - int16_t definition_level; - int16_t repetition_level; - int i; - std::shared_ptr column_reader; - - // Get the Column Reader for the boolean column - column_reader = row_group_reader->Column(0); - parquet::BoolReader* bool_reader = - static_cast(column_reader.get()); - - // Read all the rows in the column - i = 0; - while (bool_reader->HasNext()) { - bool value; - // Read one value at a time. The number of rows read is returned. values_read - // contains the number of non-null rows - rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - ASSERT_EQ(rows_read, 1); - // There are no NULL values in the rows written - ASSERT_EQ(values_read, 1); - // Verify the value written - bool expected_value = ((i % 2) == 0) ? true : false; - ASSERT_EQ(value, expected_value); - i++; - } - // Get the Column Reader for the Int32 column - column_reader = row_group_reader->Column(1); - parquet::Int32Reader* int32_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (int32_reader->HasNext()) { - int32_t value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - ASSERT_EQ(rows_read, 1); - // There are no NULL values in the rows written - ASSERT_EQ(values_read, 1); - // Verify the value written - ASSERT_EQ(value, i); - i++; - } - // Get the Column Reader for the Int64 column - column_reader = row_group_reader->Column(2); - parquet::Int64Reader* int64_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (int64_reader->HasNext()) { - int64_t value; - // Read one value at a time. The number of rows read is returned. values_read - // contains the number of non-null rows - rows_read = int64_reader->ReadBatch(1, &definition_level, &repetition_level, - &value, &values_read); - // Ensure only one value is read - ASSERT_EQ(rows_read, 1); - // There are no NULL values in the rows written - ASSERT_EQ(values_read, 1); - // Verify the value written - int64_t expected_value = i * 1000 * 1000; - expected_value *= 1000 * 1000; - ASSERT_EQ(value, expected_value); - if ((i % 2) == 0) { - ASSERT_EQ(repetition_level, 1); - } else { - ASSERT_EQ(repetition_level, 0); - } - i++; - } - - // Get the Column Reader for the Int96 column - column_reader = row_group_reader->Column(3); - parquet::Int96Reader* int96_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (int96_reader->HasNext()) { - parquet::Int96 value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = int96_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - ASSERT_EQ(rows_read, 1); - // There are no NULL values in the rows written - ASSERT_EQ(values_read, 1); - // Verify the value written - parquet::Int96 expected_value; - expected_value.value[0] = i; - expected_value.value[1] = i + 1; - expected_value.value[2] = i + 2; - for (int j = 0; j < 3; j++) { - ASSERT_EQ(value.value[j], expected_value.value[j]); - } - i++; - } - - // Get the Column Reader for the Float column - column_reader = row_group_reader->Column(4); - parquet::FloatReader* float_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (float_reader->HasNext()) { - float value; - // Read one value at a time. The number of rows read is returned. values_read - // contains the number of non-null rows - rows_read = float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - ASSERT_EQ(rows_read, 1); - // There are no NULL values in the rows written - ASSERT_EQ(values_read, 1); - // Verify the value written - float expected_value = static_cast(i) * 1.1f; - ASSERT_EQ(value, expected_value); - i++; - } - // Get the Column Reader for the Double column - column_reader = row_group_reader->Column(5); - parquet::DoubleReader* double_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (double_reader->HasNext()) { - double value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - ASSERT_EQ(rows_read, 1); - // There are no NULL values in the rows written - ASSERT_EQ(values_read, 1); - // Verify the value written - double expected_value = i * 1.1111111; - ASSERT_EQ(value, expected_value); - i++; - } - // Get the Column Reader for the ByteArray column - column_reader = row_group_reader->Column(6); - parquet::ByteArrayReader* ba_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (ba_reader->HasNext()) { - parquet::ByteArray value; - // Read one value at a time. The number of rows read is returned. values_read - // contains the number of non-null rows - rows_read = - ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); - // Ensure only one value is read - ASSERT_EQ(rows_read, 1); - // Verify the value written - char expected_value[kFixedLength] = "parquet"; - expected_value[7] = static_cast('0' + i / 100); - expected_value[8] = static_cast('0' + (i / 10) % 10); - expected_value[9] = static_cast('0' + i % 10); - if (i % 2 == 0) { // only alternate values exist - // There are no NULL values in the rows written - ASSERT_EQ(values_read, 1); - ASSERT_EQ(value.len, kFixedLength); - ASSERT_EQ(memcmp(value.ptr, &expected_value[0], kFixedLength), 0); - ASSERT_EQ(definition_level, 1); - } else { - // There are NULL values in the rows written - ASSERT_EQ(values_read, 0); - ASSERT_EQ(definition_level, 0); - } - i++; - } - // Get the Column Reader for the FixedLengthByteArray column - column_reader = row_group_reader->Column(7); - parquet::FixedLenByteArrayReader* flba_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (flba_reader->HasNext()) { - parquet::FixedLenByteArray value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = flba_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - ASSERT_EQ(rows_read, 1); - // There are no NULL values in the rows written - ASSERT_EQ(values_read, 1); - // Verify the value written - char v = static_cast(i); - char expected_value[kFixedLength] = {v, v, v, v, v, v, v, v, v, v}; - ASSERT_EQ(memcmp(value.ptr, &expected_value[0], kFixedLength), 0); - i++; - } - } - file_reader->Close(); - } catch (const std::exception& e) { - exception_msg = e.what(); - } - CheckResult(encryption_configuration, example_id, exception_msg); - } - - // Check that the decryption result is as expected. - void CheckResult(int encryption_configuration_number, int example_id, - std::string exception_msg) { - int decryption_configuration_number = example_id + 1; - // Encryption_configuration number five contains aad_prefix and - // disable_aad_prefix_storage. - // An exception is expected to be thrown if the file is not decrypted with aad_prefix. - if (encryption_configuration_number == 5) { - if (decryption_configuration_number == 1 || decryption_configuration_number == 3) { - std::size_t found = exception_msg.find("AAD"); - ASSERT_FALSE(found == std::string::npos); - return; - } - } - // Decryption configuration number two contains aad_prefix. An exception is expected - // to be thrown if the file was not encrypted with the same aad_prefix. 
- if (decryption_configuration_number == 2) { - if (encryption_configuration_number != 5 && encryption_configuration_number != 4) { - std::size_t found = exception_msg.find("AAD"); - ASSERT_FALSE(found == std::string::npos); - return; - } - } - if (!exception_msg.empty()) { - ASSERT_EQ(1, 0); - } - } - - std::shared_ptr SetupEncryptionSchema() { - parquet::schema::NodeVector fields; - // Create a primitive node named 'boolean_field' with type:BOOLEAN, - // repetition:REQUIRED - fields.push_back(PrimitiveNode::Make("boolean_field", Repetition::REQUIRED, - Type::BOOLEAN, ConvertedType::NONE)); - - // Create a primitive node named 'int32_field' with type:INT32, repetition:REQUIRED, - // logical type:TIME_MILLIS - fields.push_back(PrimitiveNode::Make("int32_field", Repetition::REQUIRED, Type::INT32, - ConvertedType::TIME_MILLIS)); - - // Create a primitive node named 'int64_field' with type:INT64, repetition:REPEATED - fields.push_back(PrimitiveNode::Make("int64_field", Repetition::REPEATED, Type::INT64, - ConvertedType::NONE)); - - fields.push_back(PrimitiveNode::Make("int96_field", Repetition::REQUIRED, Type::INT96, - ConvertedType::NONE)); - - fields.push_back(PrimitiveNode::Make("float_field", Repetition::REQUIRED, Type::FLOAT, - ConvertedType::NONE)); - - fields.push_back(PrimitiveNode::Make("double_field", Repetition::REQUIRED, - Type::DOUBLE, ConvertedType::NONE)); - - // Create a primitive node named 'ba_field' with type:BYTE_ARRAY, repetition:OPTIONAL - fields.push_back(PrimitiveNode::Make("ba_field", Repetition::OPTIONAL, - Type::BYTE_ARRAY, ConvertedType::NONE)); - - // Create a primitive node named 'flba_field' with type:FIXED_LEN_BYTE_ARRAY, - // repetition:REQUIRED, field_length = kFixedLength - fields.push_back(PrimitiveNode::Make("flba_field", Repetition::REQUIRED, - Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, - kFixedLength)); - - // Create a GroupNode named 'schema' using the primitive nodes defined above - // This GroupNode is the root node of 
the schema tree - return std::static_pointer_cast( - GroupNode::Make("schema", Repetition::REQUIRED, fields)); - } -}; - -// Encryption configuration 1: Encrypt all columns and the footer with the same key. -// (uniform encryption) -TEST_F(TestEncryptionConfiguration, UniformEncryption) { - parquet::FileEncryptionProperties::Builder file_encryption_builder_1( - kFooterEncryptionKey_); - - this->EncryptFile(file_encryption_builder_1.footer_key_metadata("kf")->build(), - file_name_); - - // Iterate over the decryption configurations and use each one to read the encrypted - // parqeut file. - for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); - ++example_id) { - DecryptFile(file_name_, example_id, 1 /* encryption_configuration_number */); - } -} - -// Encryption configuration 2: Encrypt two columns and the footer, with different keys. -TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndTheFooter) { - std::map, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath> - encryption_cols2; - std::shared_ptr path_to_double_field_ = - parquet::schema::ColumnPath::FromDotString("double_field"); - std::shared_ptr path_to_float_field_ = - parquet::schema::ColumnPath::FromDotString("float_field"); - parquet::ColumnEncryptionProperties::Builder encryption_col_builder_20( - path_to_double_field_); - parquet::ColumnEncryptionProperties::Builder encryption_col_builder_21( - path_to_float_field_); - encryption_col_builder_20.key(kColumnEncryptionKey1_)->key_id("kc1"); - encryption_col_builder_21.key(kColumnEncryptionKey2_)->key_id("kc2"); - - encryption_cols2[path_to_double_field_] = encryption_col_builder_20.build(); - encryption_cols2[path_to_float_field_] = encryption_col_builder_21.build(); - - parquet::FileEncryptionProperties::Builder file_encryption_builder_2( - kFooterEncryptionKey_); - - this->EncryptFile(file_encryption_builder_2.footer_key_metadata("kf") - ->column_properties(encryption_cols2) - ->build(), - file_name_); - 
- // Iterate over the decryption configurations and use each one to read the encrypted - // parqeut file. - for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); - ++example_id) { - DecryptFile(file_name_, example_id, 2 /* encryption_configuration_number */); - } -} - -// Encryption configuration 3: Encrypt two columns, with different keys. -// Don’t encrypt footer. -// (plaintext footer mode, readable by legacy readers) -TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsWithPlaintextFooter) { - std::map, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath> - encryption_cols3; - parquet::ColumnEncryptionProperties::Builder encryption_col_builder_30( - path_to_double_field_); - parquet::ColumnEncryptionProperties::Builder encryption_col_builder_31( - path_to_float_field_); - encryption_col_builder_30.key(kColumnEncryptionKey1_)->key_id("kc1"); - encryption_col_builder_31.key(kColumnEncryptionKey2_)->key_id("kc2"); - - encryption_cols3[path_to_double_field_] = encryption_col_builder_30.build(); - encryption_cols3[path_to_float_field_] = encryption_col_builder_31.build(); - parquet::FileEncryptionProperties::Builder file_encryption_builder_3( - kFooterEncryptionKey_); - - this->EncryptFile(file_encryption_builder_3.footer_key_metadata("kf") - ->column_properties(encryption_cols3) - ->set_plaintext_footer() - ->build(), - file_name_); - - // Iterate over the decryption configurations and use each one to read the encrypted - // parqeut file. - for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); - ++example_id) { - DecryptFile(file_name_, example_id, 3 /* encryption_configuration_number */); - } -} - -// Encryption configuration 4: Encrypt two columns and the footer, with different keys. -// Use aad_prefix. 
-TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndFooterWithAadPrefix) { - std::map, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath> - encryption_cols4; - parquet::ColumnEncryptionProperties::Builder encryption_col_builder_40( - path_to_double_field_); - parquet::ColumnEncryptionProperties::Builder encryption_col_builder_41( - path_to_float_field_); - encryption_col_builder_40.key(kColumnEncryptionKey1_)->key_id("kc1"); - encryption_col_builder_41.key(kColumnEncryptionKey2_)->key_id("kc2"); - - encryption_cols4[path_to_double_field_] = encryption_col_builder_40.build(); - encryption_cols4[path_to_float_field_] = encryption_col_builder_41.build(); - parquet::FileEncryptionProperties::Builder file_encryption_builder_4( - kFooterEncryptionKey_); - - this->EncryptFile(file_encryption_builder_4.footer_key_metadata("kf") - ->column_properties(encryption_cols4) - ->aad_prefix(kFileName_) - ->build(), - file_name_); - // Iterate over the decryption configurations and use each one to read the encrypted - // parqeut file. - for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); - ++example_id) { - DecryptFile(file_name_, example_id, 4 /* encryption_configuration_number */); - } -} - -// Encryption configuration 5: Encrypt two columns and the footer, with different keys. -// Use aad_prefix and disable_aad_prefix_storage. 
-TEST_F(TestEncryptionConfiguration, - EncryptTwoColumnsAndFooterWithAadPrefixDisable_aad_prefix_storage) { - std::map, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath> - encryption_cols5; - parquet::ColumnEncryptionProperties::Builder encryption_col_builder_50( - path_to_double_field_); - parquet::ColumnEncryptionProperties::Builder encryption_col_builder_51( - path_to_float_field_); - encryption_col_builder_50.key(kColumnEncryptionKey1_)->key_id("kc1"); - encryption_col_builder_51.key(kColumnEncryptionKey2_)->key_id("kc2"); - - encryption_cols5[path_to_double_field_] = encryption_col_builder_50.build(); - encryption_cols5[path_to_float_field_] = encryption_col_builder_51.build(); - parquet::FileEncryptionProperties::Builder file_encryption_builder_5( - kFooterEncryptionKey_); - - this->EncryptFile(file_encryption_builder_5.column_properties(encryption_cols5) - ->footer_key_metadata("kf") - ->aad_prefix(kFileName_) - ->disable_store_aad_prefix_storage() - ->build(), - file_name_); - // Iterate over the decryption configurations and use each one to read the encrypted - // parqeut file. - for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); - ++example_id) { - DecryptFile(file_name_, example_id, 5 /* encryption_configuration_number */); - } -} - -// Encryption configuration 6: Encrypt two columns and the footer, with different keys. -// Use AES_GCM_CTR_V1 algorithm. 
-TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndFooterUseAES_GCM_CTR) { - std::map, - std::shared_ptr, - parquet::schema::ColumnPath::CmpColumnPath> - encryption_cols6; - parquet::ColumnEncryptionProperties::Builder encryption_col_builder_60( - path_to_double_field_); - parquet::ColumnEncryptionProperties::Builder encryption_col_builder_61( - path_to_float_field_); - encryption_col_builder_60.key(kColumnEncryptionKey1_)->key_id("kc1"); - encryption_col_builder_61.key(kColumnEncryptionKey2_)->key_id("kc2"); - - encryption_cols6[path_to_double_field_] = encryption_col_builder_60.build(); - encryption_cols6[path_to_float_field_] = encryption_col_builder_61.build(); - parquet::FileEncryptionProperties::Builder file_encryption_builder_6( - kFooterEncryptionKey_); - - this->EncryptFile(file_encryption_builder_6.footer_key_metadata("kf") - ->column_properties(encryption_cols6) - ->algorithm(parquet::ParquetCipher::AES_GCM_CTR_V1) - ->build(), - file_name_); - - // Iterate over the decryption configurations and use each one to read the encrypted - // parqeut file. - for (unsigned example_id = 0; example_id < vector_of_decryption_configurations_.size(); - ++example_id) { - DecryptFile(file_name_, example_id, 6 /* encryption_configuration_number */); - } -} - -} // namespace test -} // namespace parquet diff --git a/cpp/src/parquet/encryption-read-configurations-test.cc b/cpp/src/parquet/encryption-read-configurations-test.cc new file mode 100644 index 000000000000..620cfe528610 --- /dev/null +++ b/cpp/src/parquet/encryption-read-configurations-test.cc @@ -0,0 +1,454 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include + +#include + +#include "parquet/column_reader.h" +#include "parquet/column_writer.h" +#include "parquet/file_reader.h" +#include "parquet/file_writer.h" +#include "parquet/platform.h" +#include "parquet/test-encryption-util.h" +#include "parquet/test-util.h" + +/* + * This file contains a unit-test for reading encrypted Parquet files with + * different decryption configurations. + * + * The unit-test is called multiple times, each time to decrypt parquet files using + * a different decryption configuration as described below. + * In each call two encrypted files are read: one temporary file that was generated using + * the encryption-write-configurations-test.cc test and will be deleted upon + * reading it, while the second resides in the + * parquet-testing/data repository. Those two encrypted files were encrypted using the + * same encryption configuration. + * The encrypted parquet file names are passed as parameters to the unit-test. + * + * A detailed description of the Parquet Modular Encryption specification can be found + * here: + * https://github.com/apache/parquet-format/blob/encryption/Encryption.md + * + * The following decryption configurations are used to decrypt each parquet file: + * + * - Decryption configuration 1: Decrypt using key retriever that holds the keys of + * two encrypted columns and the footer key. + * - Decryption configuration 2: Decrypt using key retriever that holds the keys of + * two encrypted columns and the footer key. Supplies + * aad_prefix to verify file identity. 
+ * - Decryption configuration 3: Decrypt using explicit column and footer keys + * (instead of key retrieval callback). + * + * Each encrypted parquet file that is read was encrypted using one of the configurations + * below: + * + * - Encryption configuration 1: Encrypt all columns and the footer with the same key. + * (uniform encryption) + * - Encryption configuration 2: Encrypt two columns and the footer, with different + * keys. + * - Encryption configuration 3: Encrypt two columns, with different keys. + * Don’t encrypt footer (to enable legacy readers) + * - plaintext footer mode. + * - Encryption configuration 4: Encrypt two columns and the footer, with different + * keys. Supply aad_prefix for file identity + * verification. + * - Encryption configuration 5: Encrypt two columns and the footer, with different + * keys. Supply aad_prefix, and call + * disable_aad_prefix_storage to prevent file + * identity storage in file metadata. + * - Encryption configuration 6: Encrypt two columns and the footer, with different + * keys. Use the alternative (AES_GCM_CTR_V1) algorithm. + + */ + +namespace parquet { +namespace test { + +class TestDecryptionConfiguration + : public testing::TestWithParam> { + public: + void SetUp() { createDecryptionConfigurations(); } + + protected: + std::shared_ptr path_to_double_field_ = + parquet::schema::ColumnPath::FromDotString("double_field"); + std::shared_ptr path_to_float_field_ = + parquet::schema::ColumnPath::FromDotString("float_field"); + // This vector will hold various decryption configurations. 
+ std::vector> + vector_of_decryption_configurations_; + std::string kFooterEncryptionKey_ = std::string(kFooterEncryptionKey); + std::string kColumnEncryptionKey1_ = std::string(kColumnEncryptionKey1); + std::string kColumnEncryptionKey2_ = std::string(kColumnEncryptionKey2); + std::string kFileName_ = std::string(kFileName); + + void createDecryptionConfigurations() { + /********************************************************************************** + Creating a number of Decryption configurations + **********************************************************************************/ + + // Decryption configuration 1: Decrypt using key retriever callback that holds the + // keys of two encrypted columns and the footer key. + std::shared_ptr string_kr1 = + std::make_shared(); + string_kr1->PutKey("kf", kFooterEncryptionKey_); + string_kr1->PutKey("kc1", kColumnEncryptionKey1_); + string_kr1->PutKey("kc2", kColumnEncryptionKey2_); + std::shared_ptr kr1 = + std::static_pointer_cast(string_kr1); + + parquet::FileDecryptionProperties::Builder file_decryption_builder_1; + vector_of_decryption_configurations_.push_back( + file_decryption_builder_1.key_retriever(kr1)->build()); + + // Decryption configuration 2: Decrypt using key retriever callback that holds the + // keys of two encrypted columns and the footer key. Supply aad_prefix. + std::shared_ptr string_kr2 = + std::make_shared(); + string_kr2->PutKey("kf", kFooterEncryptionKey_); + string_kr2->PutKey("kc1", kColumnEncryptionKey1_); + string_kr2->PutKey("kc2", kColumnEncryptionKey2_); + std::shared_ptr kr2 = + std::static_pointer_cast(string_kr2); + + parquet::FileDecryptionProperties::Builder file_decryption_builder_2; + vector_of_decryption_configurations_.push_back( + file_decryption_builder_2.key_retriever(kr2)->aad_prefix(kFileName_)->build()); + + // Decryption configuration 3: Decrypt using explicit column and footer keys. Supply + // aad_prefix. 
+ std::shared_ptr path_float_ptr = + parquet::schema::ColumnPath::FromDotString("float_field"); + std::shared_ptr path_double_ptr = + parquet::schema::ColumnPath::FromDotString("double_field"); + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + decryption_cols; + parquet::ColumnDecryptionProperties::Builder decryption_col_builder31( + path_double_ptr); + parquet::ColumnDecryptionProperties::Builder decryption_col_builder32(path_float_ptr); + + decryption_cols[path_double_ptr] = + decryption_col_builder31.key(kColumnEncryptionKey1_)->build(); + decryption_cols[path_float_ptr] = + decryption_col_builder32.key(kColumnEncryptionKey2_)->build(); + + parquet::FileDecryptionProperties::Builder file_decryption_builder_3; + vector_of_decryption_configurations_.push_back( + file_decryption_builder_3.footer_key(kFooterEncryptionKey_) + ->column_properties(decryption_cols) + ->build()); + } + + void DecryptFile(std::string file, int decryption_config_num) { + std::string exception_msg; + parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); + reader_properties.file_decryption_properties( + vector_of_decryption_configurations_[decryption_config_num]->DeepClone()); + + auto file_reader = + parquet::ParquetFileReader::OpenFile(file, false, reader_properties); + + // Get the File MetaData + std::shared_ptr file_metadata = file_reader->metadata(); + + // Get the number of RowGroups + int num_row_groups = file_metadata->num_row_groups(); + ASSERT_EQ(num_row_groups, 1); + + // Get the number of Columns + int num_columns = file_metadata->num_columns(); + ASSERT_EQ(num_columns, 8); + + // Iterate over all the RowGroups in the file + for (int r = 0; r < num_row_groups; ++r) { + // Get the RowGroup Reader + std::shared_ptr row_group_reader = + file_reader->RowGroup(r); + + int64_t values_read = 0; + int64_t rows_read = 0; + int16_t definition_level; + int16_t repetition_level; + int i; + std::shared_ptr column_reader; + + // Get 
the Column Reader for the boolean column + column_reader = row_group_reader->Column(0); + parquet::BoolReader* bool_reader = + static_cast(column_reader.get()); + + // Read all the rows in the column + i = 0; + while (bool_reader->HasNext()) { + bool value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + bool expected_value = ((i % 2) == 0) ? true : false; + ASSERT_EQ(value, expected_value); + i++; + } + // Get the Column Reader for the Int32 column + column_reader = row_group_reader->Column(1); + parquet::Int32Reader* int32_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int32_reader->HasNext()) { + int32_t value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + ASSERT_EQ(value, i); + i++; + } + // Get the Column Reader for the Int64 column + column_reader = row_group_reader->Column(2); + parquet::Int64Reader* int64_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int64_reader->HasNext()) { + int64_t value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = int64_reader->ReadBatch(1, &definition_level, &repetition_level, + &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + int64_t expected_value = i * 1000 * 1000; + expected_value *= 1000 * 1000; + ASSERT_EQ(value, expected_value); + if ((i % 2) == 0) { + ASSERT_EQ(repetition_level, 1); + } else { + ASSERT_EQ(repetition_level, 0); + } + i++; + } + + // Get the Column Reader for the Int96 column + column_reader = row_group_reader->Column(3); + parquet::Int96Reader* int96_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (int96_reader->HasNext()) { + parquet::Int96 value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = int96_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + parquet::Int96 expected_value; + expected_value.value[0] = i; + expected_value.value[1] = i + 1; + expected_value.value[2] = i + 2; + for (int j = 0; j < 3; j++) { + ASSERT_EQ(value.value[j], expected_value.value[j]); + } + i++; + } + + // Get the Column Reader for the Float column + column_reader = row_group_reader->Column(4); + parquet::FloatReader* float_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (float_reader->HasNext()) { + float value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + float expected_value = static_cast(i) * 1.1f; + ASSERT_EQ(value, expected_value); + i++; + } + // Get the Column Reader for the Double column + column_reader = row_group_reader->Column(5); + parquet::DoubleReader* double_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (double_reader->HasNext()) { + double value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + double expected_value = i * 1.1111111; + ASSERT_EQ(value, expected_value); + i++; + } + // Get the Column Reader for the ByteArray column + column_reader = row_group_reader->Column(6); + parquet::ByteArrayReader* ba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (ba_reader->HasNext()) { + parquet::ByteArray value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = + ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // Verify the value written + char expected_value[kFixedLength] = "parquet"; + expected_value[7] = static_cast('0' + i / 100); + expected_value[8] = static_cast('0' + (i / 10) % 10); + expected_value[9] = static_cast('0' + i % 10); + if (i % 2 == 0) { // only alternate values exist + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + ASSERT_EQ(value.len, kFixedLength); + ASSERT_EQ(memcmp(value.ptr, &expected_value[0], kFixedLength), 0); + ASSERT_EQ(definition_level, 1); + } else { + // There are NULL values in the rows written + ASSERT_EQ(values_read, 0); + ASSERT_EQ(definition_level, 0); + } + i++; + } + // Get the Column Reader for the FixedLengthByteArray column + column_reader = row_group_reader->Column(7); + parquet::FixedLenByteArrayReader* flba_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (flba_reader->HasNext()) { + parquet::FixedLenByteArray value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = flba_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + char v = static_cast(i); + char expected_value[kFixedLength] = {v, v, v, v, v, v, v, v, v, v}; + ASSERT_EQ(memcmp(value.ptr, &expected_value[0], kFixedLength), 0); + i++; + } + } + file_reader->Close(); + } + + // Check that the decryption result is as expected. 
+ void CheckResults(const std::string file_name, unsigned decryption_config_num, + unsigned encryption_config_num) { + // Encryption_configuration number five contains aad_prefix and + // disable_aad_prefix_storage. + // An exception is expected to be thrown if the file is not decrypted with aad_prefix. + if (encryption_config_num == 5) { + if (decryption_config_num == 1 || decryption_config_num == 3) { + EXPECT_THROW(DecryptFile(file_name, decryption_config_num - 1), ParquetException); + return; + } + } + // Decryption configuration number two contains aad_prefix. An exception is expected + // to be thrown if the file was not encrypted with the same aad_prefix. + if (decryption_config_num == 2) { + if (encryption_config_num != 5 && encryption_config_num != 4) { + EXPECT_THROW(DecryptFile(file_name, decryption_config_num - 1), ParquetException); + return; + } + } + EXPECT_NO_THROW(DecryptFile(file_name, decryption_config_num - 1)); + } +}; + +// Read encrypted parquet file. +// The test reads two parquet files that were encrypted using the same encryption +// configuration: +// one was generated in encryption-write-configurations-test.cc tests and is deleted +// once the file is read and the second exists in parquet-testing/data folder. +// The name of the files are passed as parameters to the unit-test. +TEST_P(TestDecryptionConfiguration, TestDecryption) { + int encryption_config_num = std::get<0>(GetParam()); + const char* param_file_name = std::get<1>(GetParam()); + // Decrypt parquet file that was generated in encryption-write-configurations-test.cc + // test. + std::string tmp_file_name = "tmp_" + std::string(param_file_name); + std::string file_name = data_file(tmp_file_name.c_str()); + + // Iterate over the decryption configurations and use each one to read the encrypted + // parqeut file. 
+ for (unsigned index = 0; index < vector_of_decryption_configurations_.size(); ++index) { + unsigned decryption_config_num = index + 1; + CheckResults(file_name, decryption_config_num, encryption_config_num); + } + // delete temporary test file. + ASSERT_EQ(std::remove(file_name.c_str()), 0); + + // Decrypt parquet file that resides in parquet-testing/data directory. + file_name = data_file(param_file_name); + // Iterate over the decryption configurations and use each one to read the encrypted + // parqeut file. + for (unsigned index = 0; index < vector_of_decryption_configurations_.size(); ++index) { + unsigned decryption_config_num = index + 1; + CheckResults(file_name, decryption_config_num, encryption_config_num); + } +} + +INSTANTIATE_TEST_CASE_P( + decryptionTests, TestDecryptionConfiguration, + ::testing::Values( + std::make_tuple(1, "uniform_encryption.parquet.encrypted"), + std::make_tuple(2, "encrypt_columns_and_footer.parquet.encrypted"), + std::make_tuple(3, "encrypt_columns_plaintext_footer.parquet.encrypted"), + std::make_tuple(4, "encrypt_columns_and_footer_aad.parquet.encrypted"), + std::make_tuple( + 5, "encrypt_columns_and_footer_disable_aad_storage.parquet.encrypted"), + std::make_tuple(6, "encrypt_columns_and_footer_ctr.parquet.encrypted"))); + +} // namespace test +} // namespace parquet diff --git a/cpp/src/parquet/encryption-write-configurations-test.cc b/cpp/src/parquet/encryption-write-configurations-test.cc new file mode 100644 index 000000000000..93e28df507dd --- /dev/null +++ b/cpp/src/parquet/encryption-write-configurations-test.cc @@ -0,0 +1,392 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include + +#include + +#include "parquet/column_reader.h" +#include "parquet/column_writer.h" +#include "parquet/file_reader.h" +#include "parquet/file_writer.h" +#include "parquet/platform.h" +#include "parquet/test-encryption-util.h" +#include "parquet/test-util.h" + +/* + * This file contains unit-tests for writing encrypted Parquet files with + * different encryption configurations. + * The files are saved in parquet-testing/data folder and will be deleted after reading + * them in encryption-read-configurations-test.cc test. + * + * A detailed description of the Parquet Modular Encryption specification can be found + * here: + * https://github.com/apache/parquet-format/blob/encryption/Encryption.md + * + * Each unit-test creates a single parquet file with eight columns using one of the + * following encryption configurations: + * + * - Encryption configuration 1: Encrypt all columns and the footer with the same key. + * (uniform encryption) + * - Encryption configuration 2: Encrypt two columns and the footer, with different + * keys. + * - Encryption configuration 3: Encrypt two columns, with different keys. + * Don’t encrypt footer (to enable legacy readers) + * - plaintext footer mode. + * - Encryption configuration 4: Encrypt two columns and the footer, with different + * keys. Supply aad_prefix for file identity + * verification. + * - Encryption configuration 5: Encrypt two columns and the footer, with different + * keys. 
Supply aad_prefix, and call + * disable_aad_prefix_storage to prevent file + * identity storage in file metadata. + * - Encryption configuration 6: Encrypt two columns and the footer, with different + * keys. Use the alternative (AES_GCM_CTR_V1) algorithm. + */ + +namespace parquet { +namespace test { + +using FileClass = ::arrow::io::FileOutputStream; + +class TestEncryptionConfiguration : public ::testing::Test { + public: + void SetUp() { + // Setup the parquet schema + schema_ = SetupEncryptionSchema(); + } + + protected: + std::shared_ptr path_to_double_field_ = + parquet::schema::ColumnPath::FromDotString("double_field"); + std::shared_ptr path_to_float_field_ = + parquet::schema::ColumnPath::FromDotString("float_field"); + std::string file_name_; + int rows_per_rowgroup_ = 50; + std::shared_ptr schema_; + std::string kFooterEncryptionKey_ = std::string(kFooterEncryptionKey); + std::string kColumnEncryptionKey1_ = std::string(kColumnEncryptionKey1); + std::string kColumnEncryptionKey2_ = std::string(kColumnEncryptionKey2); + std::string kFileName_ = std::string(kFileName); + + void EncryptFile( + std::shared_ptr encryption_configurations, + std::string file_name) { + std::shared_ptr out_file; + + std::string file = data_file(file_name.c_str()); + + WriterProperties::Builder prop_builder; + prop_builder.compression(parquet::Compression::SNAPPY); + prop_builder.encryption(encryption_configurations); + std::shared_ptr writer_properties = prop_builder.build(); + + PARQUET_THROW_NOT_OK(FileClass::Open(file, &out_file)); + // Create a ParquetFileWriter instance + std::shared_ptr file_writer = + parquet::ParquetFileWriter::Open(out_file, schema_, writer_properties); + + RowGroupWriter* row_group_writer; + row_group_writer = file_writer->AppendRowGroup(); + + // Write the Bool column + parquet::BoolWriter* bool_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + bool value = ((i % 2) == 0) ? 
true : false; + bool_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Int32 column + parquet::Int32Writer* int32_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + int32_t value = i; + int32_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Int64 column. Each row has repeats twice. + parquet::Int64Writer* int64_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < 2 * rows_per_rowgroup_; i++) { + int64_t value = i * 1000 * 1000; + value *= 1000 * 1000; + int16_t definition_level = 1; + int16_t repetition_level = 0; + if ((i % 2) == 0) { + repetition_level = 1; // start of a new record + } + int64_writer->WriteBatch(1, &definition_level, &repetition_level, &value); + } + + // Write the INT96 column. + parquet::Int96Writer* int96_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + parquet::Int96 value; + value.value[0] = i; + value.value[1] = i + 1; + value.value[2] = i + 2; + int96_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Float column + parquet::FloatWriter* float_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + float value = static_cast(i) * 1.1f; + float_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the Double column + parquet::DoubleWriter* double_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + double value = i * 1.1111111; + double_writer->WriteBatch(1, nullptr, nullptr, &value); + } + + // Write the ByteArray column. 
Make every alternate values NULL + parquet::ByteArrayWriter* ba_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + parquet::ByteArray value; + char hello[kFixedLength] = "parquet"; + hello[7] = static_cast(static_cast('0') + i / 100); + hello[8] = static_cast(static_cast('0') + (i / 10) % 10); + hello[9] = static_cast(static_cast('0') + i % 10); + if (i % 2 == 0) { + int16_t definition_level = 1; + value.ptr = reinterpret_cast(&hello[0]); + value.len = kFixedLength; + ba_writer->WriteBatch(1, &definition_level, nullptr, &value); + } else { + int16_t definition_level = 0; + ba_writer->WriteBatch(1, &definition_level, nullptr, nullptr); + } + } + // Write the FixedLengthByteArray column + parquet::FixedLenByteArrayWriter* flba_writer = + static_cast(row_group_writer->NextColumn()); + for (int i = 0; i < rows_per_rowgroup_; i++) { + parquet::FixedLenByteArray value; + char v = static_cast(i); + char flba[kFixedLength] = {v, v, v, v, v, v, v, v, v, v}; + value.ptr = reinterpret_cast(&flba[0]); + + flba_writer->WriteBatch(1, nullptr, nullptr, &value); + } + // Close the ParquetFileWriter + file_writer->Close(); + + return; + } + + std::shared_ptr SetupEncryptionSchema() { + parquet::schema::NodeVector fields; + // Create a primitive node named 'boolean_field' with type:BOOLEAN, + // repetition:REQUIRED + fields.push_back(PrimitiveNode::Make("boolean_field", Repetition::REQUIRED, + Type::BOOLEAN, ConvertedType::NONE)); + + // Create a primitive node named 'int32_field' with type:INT32, repetition:REQUIRED, + // logical type:TIME_MILLIS + fields.push_back(PrimitiveNode::Make("int32_field", Repetition::REQUIRED, Type::INT32, + ConvertedType::TIME_MILLIS)); + + // Create a primitive node named 'int64_field' with type:INT64, repetition:REPEATED + fields.push_back(PrimitiveNode::Make("int64_field", Repetition::REPEATED, Type::INT64, + ConvertedType::NONE)); + + fields.push_back(PrimitiveNode::Make("int96_field", 
Repetition::REQUIRED, Type::INT96, + ConvertedType::NONE)); + + fields.push_back(PrimitiveNode::Make("float_field", Repetition::REQUIRED, Type::FLOAT, + ConvertedType::NONE)); + + fields.push_back(PrimitiveNode::Make("double_field", Repetition::REQUIRED, + Type::DOUBLE, ConvertedType::NONE)); + + // Create a primitive node named 'ba_field' with type:BYTE_ARRAY, repetition:OPTIONAL + fields.push_back(PrimitiveNode::Make("ba_field", Repetition::OPTIONAL, + Type::BYTE_ARRAY, ConvertedType::NONE)); + + // Create a primitive node named 'flba_field' with type:FIXED_LEN_BYTE_ARRAY, + // repetition:REQUIRED, field_length = kFixedLength + fields.push_back(PrimitiveNode::Make("flba_field", Repetition::REQUIRED, + Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, + kFixedLength)); + + // Create a GroupNode named 'schema' using the primitive nodes defined above + // This GroupNode is the root node of the schema tree + return std::static_pointer_cast( + GroupNode::Make("schema", Repetition::REQUIRED, fields)); + } +}; + +// Encryption configuration 1: Encrypt all columns and the footer with the same key. +// (uniform encryption) +TEST_F(TestEncryptionConfiguration, UniformEncryption) { + parquet::FileEncryptionProperties::Builder file_encryption_builder_1( + kFooterEncryptionKey_); + + this->EncryptFile(file_encryption_builder_1.footer_key_metadata("kf")->build(), + "tmp_uniform_encryption.parquet.encrypted"); +} + +// Encryption configuration 2: Encrypt two columns and the footer, with different keys. 
+TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndTheFooter) { + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols2; + std::shared_ptr path_to_double_field_ = + parquet::schema::ColumnPath::FromDotString("double_field"); + std::shared_ptr path_to_float_field_ = + parquet::schema::ColumnPath::FromDotString("float_field"); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_20( + path_to_double_field_); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_21( + path_to_float_field_); + encryption_col_builder_20.key(kColumnEncryptionKey1_)->key_id("kc1"); + encryption_col_builder_21.key(kColumnEncryptionKey2_)->key_id("kc2"); + + encryption_cols2[path_to_double_field_] = encryption_col_builder_20.build(); + encryption_cols2[path_to_float_field_] = encryption_col_builder_21.build(); + + parquet::FileEncryptionProperties::Builder file_encryption_builder_2( + kFooterEncryptionKey_); + + this->EncryptFile(file_encryption_builder_2.footer_key_metadata("kf") + ->column_properties(encryption_cols2) + ->build(), + "tmp_encrypt_columns_and_footer.parquet.encrypted"); +} + +// Encryption configuration 3: Encrypt two columns, with different keys. +// Don’t encrypt footer. 
+// (plaintext footer mode, readable by legacy readers) +TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsWithPlaintextFooter) { + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols3; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_30( + path_to_double_field_); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_31( + path_to_float_field_); + encryption_col_builder_30.key(kColumnEncryptionKey1_)->key_id("kc1"); + encryption_col_builder_31.key(kColumnEncryptionKey2_)->key_id("kc2"); + + encryption_cols3[path_to_double_field_] = encryption_col_builder_30.build(); + encryption_cols3[path_to_float_field_] = encryption_col_builder_31.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_3( + kFooterEncryptionKey_); + + this->EncryptFile(file_encryption_builder_3.footer_key_metadata("kf") + ->column_properties(encryption_cols3) + ->set_plaintext_footer() + ->build(), + "tmp_encrypt_columns_plaintext_footer.parquet.encrypted"); +} + +// Encryption configuration 4: Encrypt two columns and the footer, with different keys. +// Use aad_prefix. 
+TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndFooterWithAadPrefix) { + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols4; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_40( + path_to_double_field_); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_41( + path_to_float_field_); + encryption_col_builder_40.key(kColumnEncryptionKey1_)->key_id("kc1"); + encryption_col_builder_41.key(kColumnEncryptionKey2_)->key_id("kc2"); + + encryption_cols4[path_to_double_field_] = encryption_col_builder_40.build(); + encryption_cols4[path_to_float_field_] = encryption_col_builder_41.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_4( + kFooterEncryptionKey_); + + this->EncryptFile(file_encryption_builder_4.footer_key_metadata("kf") + ->column_properties(encryption_cols4) + ->aad_prefix(kFileName_) + ->build(), + "tmp_encrypt_columns_and_footer_aad.parquet.encrypted"); +} + +// Encryption configuration 5: Encrypt two columns and the footer, with different keys. +// Use aad_prefix and disable_aad_prefix_storage. 
+TEST_F(TestEncryptionConfiguration, + EncryptTwoColumnsAndFooterWithAadPrefixDisable_aad_prefix_storage) { + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols5; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_50( + path_to_double_field_); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_51( + path_to_float_field_); + encryption_col_builder_50.key(kColumnEncryptionKey1_)->key_id("kc1"); + encryption_col_builder_51.key(kColumnEncryptionKey2_)->key_id("kc2"); + + encryption_cols5[path_to_double_field_] = encryption_col_builder_50.build(); + encryption_cols5[path_to_float_field_] = encryption_col_builder_51.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_5( + kFooterEncryptionKey_); + + this->EncryptFile( + file_encryption_builder_5.column_properties(encryption_cols5) + ->footer_key_metadata("kf") + ->aad_prefix(kFileName_) + ->disable_store_aad_prefix_storage() + ->build(), + "tmp_encrypt_columns_and_footer_disable_aad_storage.parquet.encrypted"); +} + +// Encryption configuration 6: Encrypt two columns and the footer, with different keys. +// Use AES_GCM_CTR_V1 algorithm. 
+TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndFooterUseAES_GCM_CTR) { + std::map, + std::shared_ptr, + parquet::schema::ColumnPath::CmpColumnPath> + encryption_cols6; + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_60( + path_to_double_field_); + parquet::ColumnEncryptionProperties::Builder encryption_col_builder_61( + path_to_float_field_); + encryption_col_builder_60.key(kColumnEncryptionKey1_)->key_id("kc1"); + encryption_col_builder_61.key(kColumnEncryptionKey2_)->key_id("kc2"); + + encryption_cols6[path_to_double_field_] = encryption_col_builder_60.build(); + encryption_cols6[path_to_float_field_] = encryption_col_builder_61.build(); + parquet::FileEncryptionProperties::Builder file_encryption_builder_6( + kFooterEncryptionKey_); + + EXPECT_NO_THROW( + this->EncryptFile(file_encryption_builder_6.footer_key_metadata("kf") + ->column_properties(encryption_cols6) + ->algorithm(parquet::ParquetCipher::AES_GCM_CTR_V1) + ->build(), + "tmp_encrypt_columns_and_footer_ctr.parquet.encrypted")); +} + +} // namespace test +} // namespace parquet diff --git a/cpp/src/parquet/test-encryption-util.cc b/cpp/src/parquet/test-encryption-util.cc new file mode 100644 index 000000000000..a1cc914205be --- /dev/null +++ b/cpp/src/parquet/test-encryption-util.cc @@ -0,0 +1,37 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This module defines an abstract interface for iterating through pages in a +// Parquet column chunk within a row group. It could be extended in the future +// to iterate through all data pages in all chunks in a file. + +#include "parquet/test-encryption-util.h" + +#include "parquet/test-util.h" + +namespace parquet { +namespace test { + +std::string data_file(const char* file) { + std::string dir_string(test::get_data_dir()); + std::stringstream ss; + ss << dir_string << "/" << file; + return ss.str(); +} + +} // namespace test +} // namespace parquet diff --git a/cpp/src/parquet/test-encryption-util.h b/cpp/src/parquet/test-encryption-util.h new file mode 100644 index 000000000000..547e9f69a84f --- /dev/null +++ b/cpp/src/parquet/test-encryption-util.h @@ -0,0 +1,63 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +// This module defines an abstract interface for iterating through pages in a +// Parquet column chunk within a row group. It could be extended in the future +// to iterate through all data pages in all chunks in a file. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "arrow/testing/util.h" + +#include "parquet/column_page.h" +#include "parquet/column_reader.h" +#include "parquet/column_writer.h" +#include "parquet/encoding.h" +#include "parquet/platform.h" + +namespace parquet { +namespace test { + +std::string data_file(const char* file); + +using parquet::ConvertedType; +using parquet::Repetition; +using parquet::Type; +using schema::GroupNode; +using schema::NodePtr; +using schema::PrimitiveNode; + +constexpr int kFixedLength = 10; + +const char kFooterEncryptionKey[] = "0123456789012345"; // 128bit/16 +const char kColumnEncryptionKey1[] = "1234567890123450"; +const char kColumnEncryptionKey2[] = "1234567890123451"; +const char kFileName[] = "tester"; + +} // namespace test +} // namespace parquet From 657609fdc609cd6229fd37a58462cb33d0ccfd34 Mon Sep 17 00:00:00 2001 From: Revital1 Eres Date: Thu, 18 Jul 2019 11:04:45 +0300 Subject: [PATCH 133/201] Throw exception when files are missing from parquet-testing repo --- .../encryption-read-configurations-test.cc | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/encryption-read-configurations-test.cc b/cpp/src/parquet/encryption-read-configurations-test.cc index 620cfe528610..5b1e9175ff29 100644 --- a/cpp/src/parquet/encryption-read-configurations-test.cc +++ b/cpp/src/parquet/encryption-read-configurations-test.cc @@ -18,8 +18,9 @@ #include #include +#include -#include +#include "arrow/io/file.h" #include "parquet/column_reader.h" #include "parquet/column_writer.h" @@ -404,6 +405,12 
@@ class TestDecryptionConfiguration } EXPECT_NO_THROW(DecryptFile(file_name, decryption_config_num - 1)); } + + // Returns true if file exists. Otherwise returns false. + bool fexists(const std::string& filename) { + std::ifstream ifile(filename.c_str()); + return ifile.good(); + } }; // Read encrypted parquet file. @@ -419,6 +426,11 @@ TEST_P(TestDecryptionConfiguration, TestDecryption) { // test. std::string tmp_file_name = "tmp_" + std::string(param_file_name); std::string file_name = data_file(tmp_file_name.c_str()); + if (!fexists(file_name)) { + std::stringstream ss; + ss << "File " << file_name << " is missing from parquet-testing repo."; + throw ParquetTestException(ss.str()); + } // Iterate over the decryption configurations and use each one to read the encrypted // parqeut file. @@ -426,11 +438,18 @@ TEST_P(TestDecryptionConfiguration, TestDecryption) { unsigned decryption_config_num = index + 1; CheckResults(file_name, decryption_config_num, encryption_config_num); } - // delete temporary test file. + // Delete temporary test file. ASSERT_EQ(std::remove(file_name.c_str()), 0); // Decrypt parquet file that resides in parquet-testing/data directory. file_name = data_file(param_file_name); + + if (!fexists(file_name)) { + std::stringstream ss; + ss << "File " << file_name << " is missing from parquet-testing repo."; + throw ParquetTestException(ss.str()); + } + // Iterate over the decryption configurations and use each one to read the encrypted // parqeut file. 
for (unsigned index = 0; index < vector_of_decryption_configurations_.size(); ++index) { From f466781a53398e70ff9851a8319a24d15d84004c Mon Sep 17 00:00:00 2001 From: Ha Thi Tham Date: Fri, 19 Jul 2019 11:26:43 +0700 Subject: [PATCH 134/201] update parquet_testing submodule with new encrypted files --- cpp/submodules/parquet-testing | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index 2fc3ade4ccbf..a277dc4e55de 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit 2fc3ade4ccbf17271194df0b1549bc6733204314 +Subproject commit a277dc4e55ded3e3ea27dab1e4faf98c112442df From 55cd2bf5da1aa231476ffa5f7da904c59e4790eb Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Thu, 25 Jul 2019 10:22:03 -0400 Subject: [PATCH 135/201] add crypto dependency to R build --- r/configure.win | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/r/configure.win b/r/configure.win index b7bb0ff8fc9d..046fdcf4754d 100644 --- a/r/configure.win +++ b/r/configure.win @@ -31,6 +31,12 @@ VERSION=$(grep ^Version DESCRIPTION | sed s/Version:\ //) # If binary not found, script exits nonzero if [ $? 
-ne 0 ]; then echo "Arrow C++ library was not found" +else + # Set the right flags to point to and enable arrow/parquet + RWINLIB="../windows/arrow-${VERSION}" + OPENSSL_LIBS="-lcrypto" + PKG_CFLAGS="-I${RWINLIB}/include -DARROW_STATIC -DPARQUET_STATIC -DARROW_R_WITH_ARROW" + PKG_LIBS="-L${RWINLIB}/lib"'$(subst gcc,,$(COMPILED_BY))$(R_ARCH) '"-L${RWINLIB}/lib"'$(R_ARCH) '"-lparquet -larrow -lthrift -lsnappy -lboost_regex-mt-s -lboost_filesystem-mt-s -lboost_system-mt-s -ldouble-conversion -lz -lws2_32 ${OPENSSL_LIBS}" fi # Set the right flags to point to and enable arrow/parquet From 1a508f2d467eca57b3387495c15ee61e7cb851f8 Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Thu, 25 Jul 2019 11:58:54 -0400 Subject: [PATCH 136/201] Print location of OpenSSL library --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index a21f0aa87c20..3afac9e605cc 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -923,6 +923,7 @@ elseif(ARROW_PARQUET) endif() if(ARROW_USE_OPENSSL) + message(STATUS "Found OpenSSL Crypto Library: ${OPENSSL_CRYPTO_LIBRARY}") message(STATUS "Building with OpenSSL (Version: ${OPENSSL_VERSION}) support") # OpenSSL::SSL and OpenSSL::Crypto were not added to From 01b4c486d7c33457fb8a6d6094656933fdfcaddd Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Thu, 25 Jul 2019 15:10:37 -0400 Subject: [PATCH 137/201] try adding crypto dependency to R build again --- r/configure.win | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/r/configure.win b/r/configure.win index 046fdcf4754d..bd79dab24c5d 100644 --- a/r/configure.win +++ b/r/configure.win @@ -34,9 +34,8 @@ if [ $? 
-ne 0 ]; then else # Set the right flags to point to and enable arrow/parquet RWINLIB="../windows/arrow-${VERSION}" - OPENSSL_LIBS="-lcrypto" PKG_CFLAGS="-I${RWINLIB}/include -DARROW_STATIC -DPARQUET_STATIC -DARROW_R_WITH_ARROW" - PKG_LIBS="-L${RWINLIB}/lib"'$(subst gcc,,$(COMPILED_BY))$(R_ARCH) '"-L${RWINLIB}/lib"'$(R_ARCH) '"-lparquet -larrow -lthrift -lsnappy -lboost_regex-mt-s -lboost_filesystem-mt-s -lboost_system-mt-s -ldouble-conversion -lz -lws2_32 ${OPENSSL_LIBS}" + PKG_LIBS="-L${RWINLIB}/lib"'$(subst gcc,,$(COMPILED_BY))$(R_ARCH) '"-L${RWINLIB}/lib"'$(R_ARCH) '"-lparquet -larrow -lthrift -lsnappy -lboost_regex-mt-s -lboost_filesystem-mt-s -lboost_system-mt-s -ldouble-conversion -lz -lcrypto -lws2_32" fi # Set the right flags to point to and enable arrow/parquet From bccb0fe43b87b74d15e3aca80fbac9da3a0f4001 Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Fri, 26 Jul 2019 14:17:30 -0400 Subject: [PATCH 138/201] add missing crypto deps --- ci/windows-pkg-arrow-for-r.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ci/windows-pkg-arrow-for-r.sh b/ci/windows-pkg-arrow-for-r.sh index f5abdb68c563..6f76aa66be39 100644 --- a/ci/windows-pkg-arrow-for-r.sh +++ b/ci/windows-pkg-arrow-for-r.sh @@ -54,6 +54,10 @@ wget https://dl.bintray.com/rtools/backports/mingw-w64-x86_64-snappy-1.1.7-2-any # double-conversion is only available in the Rtools4.0 builds, but apparently that's ok wget https://dl.bintray.com/rtools/mingw64/mingw-w64-x86_64-double-conversion-3.1.2-1-any.pkg.tar.xz wget https://dl.bintray.com/rtools/mingw32/mingw-w64-i686-double-conversion-3.1.2-1-any.pkg.tar.xz + +wget https://dl.bintray.com/rtools/mingw64/:mingw-w64-x86_64-openssl-1.1.1.a-1-any.pkg.tar.xz +wget https://dl.bintray.com/rtools/mingw32/:mingw-w64-i686-openssl-1.1.1.a-1-any.pkg.tar.xz + ls | xargs -n 1 tar -xJf cd .. 
From cf74661a11cbccf4461afb1e037582dc8bb26ad0 Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Mon, 29 Jul 2019 16:02:40 -0400 Subject: [PATCH 139/201] fix ci openssl url --- ci/windows-pkg-arrow-for-r.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/windows-pkg-arrow-for-r.sh b/ci/windows-pkg-arrow-for-r.sh index 6f76aa66be39..e4badbef2710 100644 --- a/ci/windows-pkg-arrow-for-r.sh +++ b/ci/windows-pkg-arrow-for-r.sh @@ -55,8 +55,8 @@ wget https://dl.bintray.com/rtools/backports/mingw-w64-x86_64-snappy-1.1.7-2-any wget https://dl.bintray.com/rtools/mingw64/mingw-w64-x86_64-double-conversion-3.1.2-1-any.pkg.tar.xz wget https://dl.bintray.com/rtools/mingw32/mingw-w64-i686-double-conversion-3.1.2-1-any.pkg.tar.xz -wget https://dl.bintray.com/rtools/mingw64/:mingw-w64-x86_64-openssl-1.1.1.a-1-any.pkg.tar.xz -wget https://dl.bintray.com/rtools/mingw32/:mingw-w64-i686-openssl-1.1.1.a-1-any.pkg.tar.xz +wget https://dl.bintray.com/rtools/mingw64/mingw-w64-x86_64-openssl-1.1.1.a-1-any.pkg.tar.xz +wget https://dl.bintray.com/rtools/mingw32/mingw-w64-i686-openssl-1.1.1.a-1-any.pkg.tar.xz ls | xargs -n 1 tar -xJf cd .. From 75915e8414f94caea1b42d9f67db48992a3138ea Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Tue, 30 Jul 2019 09:03:37 -0400 Subject: [PATCH 140/201] add crypt32 lib --- r/configure.win | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/r/configure.win b/r/configure.win index bd79dab24c5d..c22cb08822e2 100644 --- a/r/configure.win +++ b/r/configure.win @@ -34,8 +34,9 @@ if [ $? 
-ne 0 ]; then else # Set the right flags to point to and enable arrow/parquet RWINLIB="../windows/arrow-${VERSION}" + OPENSSL_LIBS="-lcrypto -lcrypt32" PKG_CFLAGS="-I${RWINLIB}/include -DARROW_STATIC -DPARQUET_STATIC -DARROW_R_WITH_ARROW" - PKG_LIBS="-L${RWINLIB}/lib"'$(subst gcc,,$(COMPILED_BY))$(R_ARCH) '"-L${RWINLIB}/lib"'$(R_ARCH) '"-lparquet -larrow -lthrift -lsnappy -lboost_regex-mt-s -lboost_filesystem-mt-s -lboost_system-mt-s -ldouble-conversion -lz -lcrypto -lws2_32" + PKG_LIBS="-L${RWINLIB}/lib"'$(subst gcc,,$(COMPILED_BY))$(R_ARCH) '"-L${RWINLIB}/lib"'$(R_ARCH) '"-lparquet -larrow -lthrift -lsnappy -lboost_regex-mt-s -lboost_filesystem-mt-s -lboost_system-mt-s -ldouble-conversion -lz ${OPENSSL_LIBS} -lws2_32" fi # Set the right flags to point to and enable arrow/parquet From 8e148c2c19f31ab7a6c72d7b9190f8f16d6aecd2 Mon Sep 17 00:00:00 2001 From: Gal Lushi Date: Sun, 18 Aug 2019 17:30:30 +0300 Subject: [PATCH 141/201] Applying revital's const-fix patch & Addressing Deepak's review comment for the 4th decryption config --- .../encryption-read-configurations-test.cc | 102 +++++++++++------- cpp/src/parquet/encryption.h | 4 +- cpp/src/parquet/encryption_internal.cc | 85 ++++++++------- cpp/src/parquet/encryption_internal.h | 14 +-- 4 files changed, 115 insertions(+), 90 deletions(-) diff --git a/cpp/src/parquet/encryption-read-configurations-test.cc b/cpp/src/parquet/encryption-read-configurations-test.cc index 5b1e9175ff29..f520403071f9 100644 --- a/cpp/src/parquet/encryption-read-configurations-test.cc +++ b/cpp/src/parquet/encryption-read-configurations-test.cc @@ -56,6 +56,9 @@ * aad_prefix to verify file identity. * - Decryption configuration 3: Decrypt using explicit column and footer keys * (instead of key retrieval callback). + * - Decryption Configuration 4: PlainText Footer mode - test legacy reads, + * read the footer + all non-encrypted columns. 
+ * (pairs with encryption configuration 3) * * The encrypted parquet files that is read was encrypted using one of the configurations * below: @@ -157,13 +160,20 @@ class TestDecryptionConfiguration file_decryption_builder_3.footer_key(kFooterEncryptionKey_) ->column_properties(decryption_cols) ->build()); + + // Decryption Configuration 4: use plaintext footer mode, read only footer + plaintext columns. + vector_of_decryption_configurations_.push_back(NULL); } void DecryptFile(std::string file, int decryption_config_num) { std::string exception_msg; parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); - reader_properties.file_decryption_properties( - vector_of_decryption_configurations_[decryption_config_num]->DeepClone()); + // if we get decryption_config_num = x then it means the actual number is x+1 + // and since we want decryption_config_num=4 we set the condition to 3 + if(decryption_config_num != 3){ + reader_properties.file_decryption_properties( + vector_of_decryption_configurations_[decryption_config_num]->DeepClone()); + } auto file_reader = parquet::ParquetFileReader::OpenFile(file, false, reader_properties); @@ -286,46 +296,51 @@ class TestDecryptionConfiguration i++; } - // Get the Column Reader for the Float column - column_reader = row_group_reader->Column(4); - parquet::FloatReader* float_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (float_reader->HasNext()) { - float value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - ASSERT_EQ(rows_read, 1); - // There are no NULL values in the rows written - ASSERT_EQ(values_read, 1); - // Verify the value written - float expected_value = static_cast(i) * 1.1f; - ASSERT_EQ(value, expected_value); - i++; - } - // Get the Column Reader for the Double column - column_reader = row_group_reader->Column(5); - parquet::DoubleReader* double_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (double_reader->HasNext()) { - double value; - // Read one value at a time. The number of rows read is returned. values_read - // contains the number of non-null rows - rows_read = double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - ASSERT_EQ(rows_read, 1); - // There are no NULL values in the rows written - ASSERT_EQ(values_read, 1); - // Verify the value written - double expected_value = i * 1.1111111; - ASSERT_EQ(value, expected_value); - i++; + if(decryption_config_num != 3){ + + // Get the Column Reader for the Float column + column_reader = row_group_reader->Column(4); + parquet::FloatReader* float_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (float_reader->HasNext()) { + float value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + float expected_value = static_cast(i) * 1.1f; + ASSERT_EQ(value, expected_value); + i++; + } + + // Get the Column Reader for the Double column + column_reader = row_group_reader->Column(5); + parquet::DoubleReader* double_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (double_reader->HasNext()) { + double value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + double expected_value = i * 1.1111111; + ASSERT_EQ(value, expected_value); + i++; + } } + // Get the Column Reader for the ByteArray column column_reader = row_group_reader->Column(6); parquet::ByteArrayReader* ba_reader = @@ -403,6 +418,11 @@ class TestDecryptionConfiguration return; } } + + // decryption config 4 can only work when the encryption configuration is 3 + if(decryption_config_num == 4 && encryption_config_num != 3){ + return; + } EXPECT_NO_THROW(DecryptFile(file_name, decryption_config_num - 1)); } diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 23ef5a362b41..4b0eb8a46b76 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -88,11 +88,11 @@ class PARQUET_EXPORT KeyAccessDeniedException : public ParquetException { : ParquetException(columnPath.c_str()) {} }; -inline uint8_t* str2bytes(const std::string& str) { +inline const uint8_t* str2bytes(const std::string& 
str) { if (str.empty()) return NULLPTR; char* cbytes = const_cast(str.c_str()); - return reinterpret_cast(cbytes); + return reinterpret_cast(cbytes); } class PARQUET_EXPORT ColumnEncryptionProperties { diff --git a/cpp/src/parquet/encryption_internal.cc b/cpp/src/parquet/encryption_internal.cc index 77a32fc10737..6bd4bedf11af 100644 --- a/cpp/src/parquet/encryption_internal.cc +++ b/cpp/src/parquet/encryption_internal.cc @@ -60,12 +60,12 @@ class AesEncryptor::AesEncryptorImpl { } } - int Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* key, int key_len, - uint8_t* aad, int aad_len, uint8_t* ciphertext); + int Encrypt(const uint8_t* plaintext, int plaintext_len, const uint8_t* key, + int key_len, const uint8_t* aad, int aad_len, uint8_t* ciphertext); - int SignedFooterEncrypt(const uint8_t* footer, int footer_len, uint8_t* key, - int key_len, uint8_t* aad, int aad_len, uint8_t* nonce, - uint8_t* encrypted_footer); + int SignedFooterEncrypt(const uint8_t* footer, int footer_len, const uint8_t* key, + int key_len, const uint8_t* aad, int aad_len, + const uint8_t* nonce, uint8_t* encrypted_footer); void WipeOut() { if (NULLPTR != ctx_) { EVP_CIPHER_CTX_free(ctx_); @@ -81,11 +81,12 @@ class AesEncryptor::AesEncryptorImpl { int key_length_; int ciphertext_size_delta_; - int GcmEncrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* key, int key_len, - uint8_t* nonce, uint8_t* aad, int aad_len, uint8_t* ciphertext); + int GcmEncrypt(const uint8_t* plaintext, int plaintext_len, const uint8_t* key, + int key_len, const uint8_t* nonce, const uint8_t* aad, int aad_len, + uint8_t* ciphertext); - int CtrEncrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* key, int key_len, - uint8_t* nonce, uint8_t* ciphertext); + int CtrEncrypt(const uint8_t* plaintext, int plaintext_len, const uint8_t* key, + int key_len, const uint8_t* nonce, uint8_t* ciphertext); }; AesEncryptor::AesEncryptorImpl::AesEncryptorImpl(ParquetCipher::type alg_id, int key_len, @@ 
-134,11 +135,9 @@ AesEncryptor::AesEncryptorImpl::AesEncryptorImpl(ParquetCipher::type alg_id, int } } -int AesEncryptor::AesEncryptorImpl::SignedFooterEncrypt(const uint8_t* footer, - int footer_len, uint8_t* key, - int key_len, uint8_t* aad, - int aad_len, uint8_t* nonce, - uint8_t* encrypted_footer) { +int AesEncryptor::AesEncryptorImpl::SignedFooterEncrypt( + const uint8_t* footer, int footer_len, const uint8_t* key, int key_len, + const uint8_t* aad, int aad_len, const uint8_t* nonce, uint8_t* encrypted_footer) { if (key_length_ != key_len) { std::stringstream ss; ss << "Wrong key length " << key_len << ". Should be " << key_length_; @@ -154,8 +153,9 @@ int AesEncryptor::AesEncryptorImpl::SignedFooterEncrypt(const uint8_t* footer, } int AesEncryptor::AesEncryptorImpl::Encrypt(const uint8_t* plaintext, int plaintext_len, - uint8_t* key, int key_len, uint8_t* aad, - int aad_len, uint8_t* ciphertext) { + const uint8_t* key, int key_len, + const uint8_t* aad, int aad_len, + uint8_t* ciphertext) { if (key_length_ != key_len) { std::stringstream ss; ss << "Wrong key length " << key_len << ". 
Should be " << key_length_; @@ -176,9 +176,10 @@ int AesEncryptor::AesEncryptorImpl::Encrypt(const uint8_t* plaintext, int plaint } int AesEncryptor::AesEncryptorImpl::GcmEncrypt(const uint8_t* plaintext, - int plaintext_len, uint8_t* key, - int key_len, uint8_t* nonce, uint8_t* aad, - int aad_len, uint8_t* ciphertext) { + int plaintext_len, const uint8_t* key, + int key_len, const uint8_t* nonce, + const uint8_t* aad, int aad_len, + uint8_t* ciphertext) { int len; int ciphertext_len; @@ -230,8 +231,8 @@ int AesEncryptor::AesEncryptorImpl::GcmEncrypt(const uint8_t* plaintext, } int AesEncryptor::AesEncryptorImpl::CtrEncrypt(const uint8_t* plaintext, - int plaintext_len, uint8_t* key, - int key_len, uint8_t* nonce, + int plaintext_len, const uint8_t* key, + int key_len, const uint8_t* nonce, uint8_t* ciphertext) { int len; int ciphertext_len; @@ -279,9 +280,10 @@ int AesEncryptor::AesEncryptorImpl::CtrEncrypt(const uint8_t* plaintext, AesEncryptor::~AesEncryptor() {} -int AesEncryptor::SignedFooterEncrypt(const uint8_t* footer, int footer_len, uint8_t* key, - int key_len, uint8_t* aad, int aad_len, - uint8_t* nonce, uint8_t* encrypted_footer) { +int AesEncryptor::SignedFooterEncrypt(const uint8_t* footer, int footer_len, + const uint8_t* key, int key_len, const uint8_t* aad, + int aad_len, const uint8_t* nonce, + uint8_t* encrypted_footer) { return impl_->SignedFooterEncrypt(footer, footer_len, key, key_len, aad, aad_len, nonce, encrypted_footer); } @@ -290,8 +292,9 @@ void AesEncryptor::WipeOut() { impl_->WipeOut(); } int AesEncryptor::CiphertextSizeDelta() { return impl_->ciphertext_size_delta(); } -int AesEncryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* key, - int key_len, uint8_t* aad, int aad_len, uint8_t* ciphertext) { +int AesEncryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, const uint8_t* key, + int key_len, const uint8_t* aad, int aad_len, + uint8_t* ciphertext) { return impl_->Encrypt(plaintext, plaintext_len, key, 
key_len, aad, aad_len, ciphertext); } @@ -310,8 +313,8 @@ class AesDecryptor::AesDecryptorImpl { } } - int Decrypt(const uint8_t* ciphertext, int ciphertext_len, uint8_t* key, int key_len, - uint8_t* aad, int aad_len, uint8_t* plaintext); + int Decrypt(const uint8_t* ciphertext, int ciphertext_len, const uint8_t* key, + int key_len, const uint8_t* aad, int aad_len, uint8_t* plaintext); void WipeOut() { if (NULLPTR != ctx_) { @@ -327,15 +330,16 @@ class AesDecryptor::AesDecryptorImpl { int aes_mode_; int key_length_; int ciphertext_size_delta_; - int GcmDecrypt(const uint8_t* ciphertext, int ciphertext_len, uint8_t* key, int key_len, - uint8_t* aad, int aad_len, uint8_t* plaintext); + int GcmDecrypt(const uint8_t* ciphertext, int ciphertext_len, const uint8_t* key, + int key_len, const uint8_t* aad, int aad_len, uint8_t* plaintext); - int CtrDecrypt(const uint8_t* ciphertext, int ciphertext_len, uint8_t* key, int key_len, - uint8_t* plaintext); + int CtrDecrypt(const uint8_t* ciphertext, int ciphertext_len, const uint8_t* key, + int key_len, uint8_t* plaintext); }; -int AesDecryptor::Decrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* key, - int key_len, uint8_t* aad, int aad_len, uint8_t* ciphertext) { +int AesDecryptor::Decrypt(const uint8_t* plaintext, int plaintext_len, const uint8_t* key, + int key_len, const uint8_t* aad, int aad_len, + uint8_t* ciphertext) { return impl_->Decrypt(plaintext, plaintext_len, key, key_len, aad, aad_len, ciphertext); } @@ -424,9 +428,9 @@ AesDecryptor* AesDecryptor::Make(ParquetCipher::type alg_id, int key_len, bool m int AesDecryptor::CiphertextSizeDelta() { return impl_->ciphertext_size_delta(); } int AesDecryptor::AesDecryptorImpl::GcmDecrypt(const uint8_t* ciphertext, - int ciphertext_len, uint8_t* key, - int key_len, uint8_t* aad, int aad_len, - uint8_t* plaintext) { + int ciphertext_len, const uint8_t* key, + int key_len, const uint8_t* aad, + int aad_len, uint8_t* plaintext) { int len; int plaintext_len; @@ -486,7 
+490,7 @@ int AesDecryptor::AesDecryptorImpl::GcmDecrypt(const uint8_t* ciphertext, } int AesDecryptor::AesDecryptorImpl::CtrDecrypt(const uint8_t* ciphertext, - int ciphertext_len, uint8_t* key, + int ciphertext_len, const uint8_t* key, int key_len, uint8_t* plaintext) { int len; int plaintext_len; @@ -538,8 +542,9 @@ int AesDecryptor::AesDecryptorImpl::CtrDecrypt(const uint8_t* ciphertext, } int AesDecryptor::AesDecryptorImpl::Decrypt(const uint8_t* ciphertext, int ciphertext_len, - uint8_t* key, int key_len, uint8_t* aad, - int aad_len, uint8_t* plaintext) { + const uint8_t* key, int key_len, + const uint8_t* aad, int aad_len, + uint8_t* plaintext) { if (key_length_ != key_len) { std::stringstream ss; ss << "Wrong key length " << key_len << ". Should be " << key_length_; diff --git a/cpp/src/parquet/encryption_internal.h b/cpp/src/parquet/encryption_internal.h index ec30cd5dd902..e6ade70f7e02 100644 --- a/cpp/src/parquet/encryption_internal.h +++ b/cpp/src/parquet/encryption_internal.h @@ -56,13 +56,13 @@ class AesEncryptor { /// Encrypts plaintext with the key and aad. Key length is passed only for validation. /// If different from value in constructor, exception will be thrown. - int Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* key, int key_len, - uint8_t* aad, int aad_len, uint8_t* ciphertext); + int Encrypt(const uint8_t* plaintext, int plaintext_len, const uint8_t* key, + int key_len, const uint8_t* aad, int aad_len, uint8_t* ciphertext); /// Encrypts plaintext footer, in order to compute footer signature (tag). 
- int SignedFooterEncrypt(const uint8_t* footer, int footer_len, uint8_t* key, - int key_len, uint8_t* aad, int aad_len, uint8_t* nonce, - uint8_t* encrypted_footer); + int SignedFooterEncrypt(const uint8_t* footer, int footer_len, const uint8_t* key, + int key_len, const uint8_t* aad, int aad_len, + const uint8_t* nonce, uint8_t* encrypted_footer); void WipeOut(); @@ -88,8 +88,8 @@ class AesDecryptor { /// Decrypts ciphertext with the key and aad. Key length is passed only for /// validation. If different from value in constructor, exception will be thrown. - int Decrypt(const uint8_t* ciphertext, int ciphertext_len, uint8_t* key, int key_len, - uint8_t* aad, int aad_len, uint8_t* plaintext); + int Decrypt(const uint8_t* ciphertext, int ciphertext_len, const uint8_t* key, + int key_len, const uint8_t* aad, int aad_len, uint8_t* plaintext); private: /// Can serve one key length only. Possible values: 16, 24, 32 bytes. From e9be8051f5362f34a69ab7b613cd68bc8d421e79 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Tue, 20 Aug 2019 14:10:35 +0700 Subject: [PATCH 142/201] post-rebase change --- cpp/src/parquet/file_writer.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 8955acd174e1..6d73bfb01481 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -136,7 +136,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents { : NULLPTR; std::unique_ptr pager = PageWriter::Open( sink_, properties_->compression(column_descr->path()), col_meta, - row_group_ordinal_, static_cast(current_column_index_ - 1), + row_group_ordinal_, static_cast(next_column_index_ - 1), properties_->memory_pool(), false, meta_encryptor, data_encryptor); column_writers_[0] = ColumnWriter::Make(col_meta, std::move(pager), properties_); return column_writers_[0].get(); @@ -244,7 +244,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents { std::unique_ptr pager = 
PageWriter::Open( sink_, properties_->compression(column_descr->path()), col_meta, static_cast(row_group_ordinal_), - static_cast(current_column_index_), properties_->memory_pool(), + static_cast(next_column_index_), properties_->memory_pool(), buffered_row_group_, meta_encryptor, data_encryptor); column_writers_.push_back( ColumnWriter::Make(col_meta, std::move(pager), properties_)); From 9fa19676477c39822659b9ab3cb2724b99180ea3 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Tue, 20 Aug 2019 23:10:04 +0700 Subject: [PATCH 143/201] fix comments --- ...yption-reader-writer-all-crypto-options.cc | 52 +++++++-- ...ryption-reader-writer-all-crypto-options.h | 54 ---------- .../low-level-api/encryption-reader-writer.cc | 2 +- cpp/src/parquet/CMakeLists.txt | 1 - cpp/src/parquet/column_writer.cc | 15 ++- cpp/src/parquet/encryption-properties-test.cc | 34 +++--- .../encryption-read-configurations-test.cc | 102 +++++++++--------- .../encryption-write-configurations-test.cc | 12 +-- cpp/src/parquet/encryption.cc | 74 +++++++------ cpp/src/parquet/encryption.h | 47 ++++---- cpp/src/parquet/file_writer.cc | 8 +- cpp/src/parquet/internal_file_encryptor.cc | 2 +- cpp/src/parquet/properties.h | 2 +- cpp/src/parquet/test-encryption-util.cc | 37 ------- cpp/src/parquet/test-encryption-util.h | 10 +- 15 files changed, 202 insertions(+), 250 deletions(-) delete mode 100644 cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.h delete mode 100644 cpp/src/parquet/test-encryption-util.cc diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc index 06d43be8f5a0..710c3c05829a 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc @@ -23,7 +23,12 @@ #include #include -#include +#include +#include + +#include 
+#include + /* * This file contains samples for writing and reading encrypted Parquet files in different @@ -95,6 +100,13 @@ const std::string kColumnEncryptionKey2 = "1234567890123451"; const std::string fileName = "tester"; using FileClass = ::arrow::io::FileOutputStream; +using parquet::ConvertedType; +using parquet::Repetition; +using parquet::Type; +using parquet::schema::GroupNode; +using parquet::schema::PrimitiveNode; + +constexpr int FIXED_LENGTH = 10; void PrintDecryptionConfiguration(int configuration); // Check that the decryption result is as expected. @@ -118,6 +130,30 @@ std::vector GetDirectoryFiles(const std::string& path) { return files; } +static std::shared_ptr SetupSchema() { + parquet::schema::NodeVector fields; + // Create a primitive node named 'boolean_field' with type:BOOLEAN, + // repetition:REQUIRED + fields.push_back(PrimitiveNode::Make("boolean_field", Repetition::REQUIRED, + Type::BOOLEAN, ConvertedType::NONE)); + + // Create a primitive node named 'int32_field' with type:INT32, repetition:REQUIRED, + // logical type:TIME_MILLIS + fields.push_back(PrimitiveNode::Make("int32_field", Repetition::REQUIRED, Type::INT32, + ConvertedType::TIME_MILLIS)); + + fields.push_back(PrimitiveNode::Make("float_field", Repetition::REQUIRED, Type::FLOAT, + ConvertedType::NONE)); + + fields.push_back(PrimitiveNode::Make("double_field", Repetition::REQUIRED, Type::DOUBLE, + ConvertedType::NONE)); + + // Create a GroupNode named 'schema' using the primitive nodes defined above + // This GroupNode is the root node of the schema tree + return std::static_pointer_cast( + GroupNode::Make("schema", Repetition::REQUIRED, fields)); +} + void InteropTestWriteEncryptedParquetFiles(std::string root_path) { /********************************************************************************** Creating a number of Encryption configurations @@ -157,7 +193,7 @@ void InteropTestWriteEncryptedParquetFiles(std::string root_path) { 
vector_of_encryption_configurations.push_back( file_encryption_builder_2.footer_key_metadata("kf") - ->column_properties(encryption_cols2) + ->encrypted_columns(encryption_cols2) ->build()); // Encryption configuration 3: Encrypt two columns, with different keys. @@ -179,7 +215,7 @@ void InteropTestWriteEncryptedParquetFiles(std::string root_path) { vector_of_encryption_configurations.push_back( file_encryption_builder_3.footer_key_metadata("kf") - ->column_properties(encryption_cols3) + ->encrypted_columns(encryption_cols3) ->set_plaintext_footer() ->build()); @@ -201,7 +237,7 @@ void InteropTestWriteEncryptedParquetFiles(std::string root_path) { vector_of_encryption_configurations.push_back( file_encryption_builder_4.footer_key_metadata("kf") - ->column_properties(encryption_cols4) + ->encrypted_columns(encryption_cols4) ->aad_prefix(fileName) ->build()); @@ -222,10 +258,10 @@ void InteropTestWriteEncryptedParquetFiles(std::string root_path) { kFooterEncryptionKey); vector_of_encryption_configurations.push_back( - file_encryption_builder_5.column_properties(encryption_cols5) + file_encryption_builder_5.encrypted_columns(encryption_cols5) ->footer_key_metadata("kf") ->aad_prefix(fileName) - ->disable_store_aad_prefix_storage() + ->disable_aad_prefix_storage() ->build()); // Encryption configuration 6: Encrypt two columns and the footer, with different keys. 
@@ -246,7 +282,7 @@ void InteropTestWriteEncryptedParquetFiles(std::string root_path) { vector_of_encryption_configurations.push_back( file_encryption_builder_6.footer_key_metadata("kf") - ->column_properties(encryption_cols6) + ->encrypted_columns(encryption_cols6) ->algorithm(parquet::ParquetCipher::AES_GCM_CTR_V1) ->build()); @@ -389,7 +425,7 @@ void InteropTestReadEncryptedParquetFiles(std::string root_path) { parquet::FileDecryptionProperties::Builder file_decryption_builder_3; vector_of_decryption_configurations.push_back( file_decryption_builder_3.footer_key(kFooterEncryptionKey) - ->column_properties(decryption_cols) + ->column_keys(decryption_cols) ->build()); /********************************************************************************** diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.h b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.h deleted file mode 100644 index 2ca3a064768a..000000000000 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.h +++ /dev/null @@ -1,54 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include -#include - -#include -#include - -using parquet::ConvertedType; -using parquet::Repetition; -using parquet::Type; -using parquet::schema::GroupNode; -using parquet::schema::PrimitiveNode; - -constexpr int FIXED_LENGTH = 10; - -static std::shared_ptr SetupSchema() { - parquet::schema::NodeVector fields; - // Create a primitive node named 'boolean_field' with type:BOOLEAN, - // repetition:REQUIRED - fields.push_back(PrimitiveNode::Make("boolean_field", Repetition::REQUIRED, - Type::BOOLEAN, ConvertedType::NONE)); - - // Create a primitive node named 'int32_field' with type:INT32, repetition:REQUIRED, - // logical type:TIME_MILLIS - fields.push_back(PrimitiveNode::Make("int32_field", Repetition::REQUIRED, Type::INT32, - ConvertedType::TIME_MILLIS)); - - fields.push_back(PrimitiveNode::Make("float_field", Repetition::REQUIRED, Type::FLOAT, - ConvertedType::NONE)); - - fields.push_back(PrimitiveNode::Make("double_field", Repetition::REQUIRED, Type::DOUBLE, - ConvertedType::NONE)); - - // Create a GroupNode named 'schema' using the primitive nodes defined above - // This GroupNode is the root node of the schema tree - return std::static_pointer_cast( - GroupNode::Make("schema", Repetition::REQUIRED, fields)); -} diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc index 5ce66769c0fb..6445737e56e8 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer.cc @@ -84,7 +84,7 @@ int main(int argc, char** argv) { // Add the current encryption configuration to WriterProperties. 
builder.encryption(file_encryption_builder.footer_key_metadata("kf") - ->column_properties(encryption_cols) + ->encrypted_columns(encryption_cols) ->build()); std::shared_ptr props = builder.build(); diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 246cf215ffbe..e76fd07484a0 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -332,7 +332,6 @@ if(ARROW_USE_OPENSSL) encryption-write-configurations-test.cc encryption-read-configurations-test.cc encryption-properties-test.cc - test-encryption-util.cc test-util.cc) endif() diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 6c687776e973..b301e701996a 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -138,9 +138,9 @@ int LevelEncoder::Encode(int batch_size, const int16_t* levels) { // and the page metadata. class SerializedPageWriter : public PageWriter { public: - SerializedPageWriter(const std::shared_ptr& sink, Compression::type codec, - ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, - int16_t column_chunk_ordinal, + SerializedPageWriter(const std::shared_ptr& sink, + Compression::type codec, ColumnChunkMetaDataBuilder* metadata, + int16_t row_group_ordinal, int16_t column_chunk_ordinal, MemoryPool* pool = arrow::default_memory_pool(), std::shared_ptr meta_encryptor = NULLPTR, std::shared_ptr data_encryptor = NULLPTR) @@ -401,9 +401,8 @@ class SerializedPageWriter : public PageWriter { class BufferedPageWriter : public PageWriter { public: BufferedPageWriter(const std::shared_ptr& sink, - Compression::type codec, - ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, - int16_t current_column_ordinal, + Compression::type codec, ColumnChunkMetaDataBuilder* metadata, + int16_t row_group_ordinal, int16_t current_column_ordinal, MemoryPool* pool = arrow::default_memory_pool(), std::shared_ptr meta_encryptor = NULLPTR, std::shared_ptr data_encryptor = 
NULLPTR) @@ -458,8 +457,8 @@ std::unique_ptr PageWriter::Open( const std::shared_ptr& sink, Compression::type codec, ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal, int16_t column_chunk_ordinal, MemoryPool* pool, bool buffered_row_group, - std::shared_ptr meta_encryptor, std::shared_ptr data_encryptor -) { + std::shared_ptr meta_encryptor, + std::shared_ptr data_encryptor) { if (buffered_row_group) { return std::unique_ptr(new BufferedPageWriter( sink, codec, metadata, row_group_ordinal, column_chunk_ordinal, pool, diff --git a/cpp/src/parquet/encryption-properties-test.cc b/cpp/src/parquet/encryption-properties-test.cc index 088a2c08bf88..f003b0621a9b 100644 --- a/cpp/src/parquet/encryption-properties-test.cc +++ b/cpp/src/parquet/encryption-properties-test.cc @@ -73,7 +73,7 @@ TEST(TestEncryptionProperties, UniformEncryption) { std::shared_ptr column_path = parquet::schema::ColumnPath::FromDotString("a_column"); std::shared_ptr out_col_props = - props->column_properties(column_path); + props->column_encryption_properties(column_path); ASSERT_EQ(true, out_col_props->is_encrypted()); ASSERT_EQ(true, out_col_props->is_encrypted_with_footer_key()); @@ -96,13 +96,13 @@ TEST(TestEncryptionProperties, EncryptFooterAndTwoColumns) { std::map, std::shared_ptr, schema::ColumnPath::CmpColumnPath> - column_properties; - column_properties[column_path_1] = column_builder_1.build(); - column_properties[column_path_2] = column_builder_2.build(); + encrypted_columns; + encrypted_columns[column_path_1] = column_builder_1.build(); + encrypted_columns[column_path_2] = column_builder_2.build(); FileEncryptionProperties::Builder builder(kFooterEncryptionKey); builder.footer_key_metadata("kf"); - builder.column_properties(column_properties); + builder.encrypted_columns(encrypted_columns); std::shared_ptr props = builder.build(); ASSERT_EQ(true, props->encrypted_footer()); @@ -110,7 +110,7 @@ TEST(TestEncryptionProperties, EncryptFooterAndTwoColumns) { 
ASSERT_EQ(kFooterEncryptionKey, props->footer_key()); std::shared_ptr out_col_props_1 = - props->column_properties(column_path_1); + props->column_encryption_properties(column_path_1); ASSERT_EQ(column_path_1->ToDotString(), out_col_props_1->column_path()->ToDotString()); ASSERT_EQ(true, out_col_props_1->is_encrypted()); @@ -119,7 +119,7 @@ TEST(TestEncryptionProperties, EncryptFooterAndTwoColumns) { ASSERT_EQ("kc1", out_col_props_1->key_metadata()); std::shared_ptr out_col_props_2 = - props->column_properties(column_path_2); + props->column_encryption_properties(column_path_2); ASSERT_EQ(column_path_2->ToDotString(), out_col_props_2->column_path()->ToDotString()); ASSERT_EQ(true, out_col_props_2->is_encrypted()); @@ -130,7 +130,7 @@ TEST(TestEncryptionProperties, EncryptFooterAndTwoColumns) { std::shared_ptr column_path_3 = parquet::schema::ColumnPath::FromDotString("column_3"); std::shared_ptr out_col_props_3 = - props->column_properties(column_path_3); + props->column_encryption_properties(column_path_3); ASSERT_EQ(NULLPTR, out_col_props_3); } @@ -152,14 +152,14 @@ TEST(TestEncryptionProperties, EncryptTwoColumnsNotFooter) { std::map, std::shared_ptr, schema::ColumnPath::CmpColumnPath> - column_properties; - column_properties[column_path_1] = column_builder_1.build(); - column_properties[column_path_2] = column_builder_2.build(); + encrypted_columns; + encrypted_columns[column_path_1] = column_builder_1.build(); + encrypted_columns[column_path_2] = column_builder_2.build(); FileEncryptionProperties::Builder builder(kFooterEncryptionKey); builder.footer_key_metadata("kf"); builder.set_plaintext_footer(); - builder.column_properties(column_properties); + builder.encrypted_columns(encrypted_columns); std::shared_ptr props = builder.build(); ASSERT_EQ(false, props->encrypted_footer()); @@ -167,7 +167,7 @@ TEST(TestEncryptionProperties, EncryptTwoColumnsNotFooter) { ASSERT_EQ(kFooterEncryptionKey, props->footer_key()); std::shared_ptr out_col_props_1 = - 
props->column_properties(column_path_1); + props->column_encryption_properties(column_path_1); ASSERT_EQ(column_path_1->ToDotString(), out_col_props_1->column_path()->ToDotString()); ASSERT_EQ(true, out_col_props_1->is_encrypted()); @@ -176,7 +176,7 @@ TEST(TestEncryptionProperties, EncryptTwoColumnsNotFooter) { ASSERT_EQ("kc1", out_col_props_1->key_metadata()); std::shared_ptr out_col_props_2 = - props->column_properties(column_path_2); + props->column_encryption_properties(column_path_2); ASSERT_EQ(column_path_2->ToDotString(), out_col_props_2->column_path()->ToDotString()); ASSERT_EQ(true, out_col_props_2->is_encrypted()); @@ -189,7 +189,7 @@ TEST(TestEncryptionProperties, EncryptTwoColumnsNotFooter) { std::shared_ptr column_path_3 = parquet::schema::ColumnPath::FromDotString("column_3"); std::shared_ptr out_col_props_3 = - props->column_properties(column_path_3); + props->column_encryption_properties(column_path_3); ASSERT_EQ(NULLPTR, out_col_props_3); } @@ -209,7 +209,7 @@ TEST(TestEncryptionProperties, UseAadPrefix) { TEST(TestEncryptionProperties, UseAadPrefixNotStoreInFile) { FileEncryptionProperties::Builder builder(kFooterEncryptionKey); builder.aad_prefix(kFileName); - builder.disable_store_aad_prefix_storage(); + builder.disable_aad_prefix_storage(); std::shared_ptr props = builder.build(); ASSERT_EQ("", props->algorithm().aad.aad_prefix); @@ -280,7 +280,7 @@ TEST(TestDecryptionProperties, UsingExplicitFooterAndColumnKeys) { parquet::FileDecryptionProperties::Builder builder; builder.footer_key(kFooterEncryptionKey); - builder.column_properties(decryption_cols); + builder.column_keys(decryption_cols); std::shared_ptr props = builder.build(); ASSERT_EQ(kFooterEncryptionKey, props->footer_key()); diff --git a/cpp/src/parquet/encryption-read-configurations-test.cc b/cpp/src/parquet/encryption-read-configurations-test.cc index f520403071f9..225912f70665 100644 --- a/cpp/src/parquet/encryption-read-configurations-test.cc +++ 
b/cpp/src/parquet/encryption-read-configurations-test.cc @@ -158,11 +158,12 @@ class TestDecryptionConfiguration parquet::FileDecryptionProperties::Builder file_decryption_builder_3; vector_of_decryption_configurations_.push_back( file_decryption_builder_3.footer_key(kFooterEncryptionKey_) - ->column_properties(decryption_cols) + ->column_keys(decryption_cols) ->build()); - // Decryption Configuration 4: use plaintext footer mode, read only footer + plaintext columns. - vector_of_decryption_configurations_.push_back(NULL); + // Decryption Configuration 4: use plaintext footer mode, read only footer + plaintext + // columns. + vector_of_decryption_configurations_.push_back(NULL); } void DecryptFile(std::string file, int decryption_config_num) { @@ -170,9 +171,9 @@ class TestDecryptionConfiguration parquet::ReaderProperties reader_properties = parquet::default_reader_properties(); // if we get decryption_config_num = x then it means the actual number is x+1 // and since we want decryption_config_num=4 we set the condition to 3 - if(decryption_config_num != 3){ - reader_properties.file_decryption_properties( - vector_of_decryption_configurations_[decryption_config_num]->DeepClone()); + if (decryption_config_num != 3) { + reader_properties.file_decryption_properties( + vector_of_decryption_configurations_[decryption_config_num]->DeepClone()); } auto file_reader = @@ -296,49 +297,48 @@ class TestDecryptionConfiguration i++; } - if(decryption_config_num != 3){ - - // Get the Column Reader for the Float column - column_reader = row_group_reader->Column(4); - parquet::FloatReader* float_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (float_reader->HasNext()) { - float value; - // Read one value at a time. The number of rows read is returned. 
values_read - // contains the number of non-null rows - rows_read = float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - ASSERT_EQ(rows_read, 1); - // There are no NULL values in the rows written - ASSERT_EQ(values_read, 1); - // Verify the value written - float expected_value = static_cast(i) * 1.1f; - ASSERT_EQ(value, expected_value); - i++; - } - - // Get the Column Reader for the Double column - column_reader = row_group_reader->Column(5); - parquet::DoubleReader* double_reader = - static_cast(column_reader.get()); - // Read all the rows in the column - i = 0; - while (double_reader->HasNext()) { - double value; - // Read one value at a time. The number of rows read is returned. values_read - // contains the number of non-null rows - rows_read = double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); - // Ensure only one value is read - ASSERT_EQ(rows_read, 1); - // There are no NULL values in the rows written - ASSERT_EQ(values_read, 1); - // Verify the value written - double expected_value = i * 1.1111111; - ASSERT_EQ(value, expected_value); - i++; - } + if (decryption_config_num != 3) { + // Get the Column Reader for the Float column + column_reader = row_group_reader->Column(4); + parquet::FloatReader* float_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (float_reader->HasNext()) { + float value; + // Read one value at a time. The number of rows read is returned. 
values_read + // contains the number of non-null rows + rows_read = float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + float expected_value = static_cast(i) * 1.1f; + ASSERT_EQ(value, expected_value); + i++; + } + + // Get the Column Reader for the Double column + column_reader = row_group_reader->Column(5); + parquet::DoubleReader* double_reader = + static_cast(column_reader.get()); + // Read all the rows in the column + i = 0; + while (double_reader->HasNext()) { + double value; + // Read one value at a time. The number of rows read is returned. values_read + // contains the number of non-null rows + rows_read = double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + // Ensure only one value is read + ASSERT_EQ(rows_read, 1); + // There are no NULL values in the rows written + ASSERT_EQ(values_read, 1); + // Verify the value written + double expected_value = i * 1.1111111; + ASSERT_EQ(value, expected_value); + i++; + } } // Get the Column Reader for the ByteArray column @@ -420,8 +420,8 @@ class TestDecryptionConfiguration } // decryption config 4 can only work when the encryption configuration is 3 - if(decryption_config_num == 4 && encryption_config_num != 3){ - return; + if (decryption_config_num == 4 && encryption_config_num != 3) { + return; } EXPECT_NO_THROW(DecryptFile(file_name, decryption_config_num - 1)); } diff --git a/cpp/src/parquet/encryption-write-configurations-test.cc b/cpp/src/parquet/encryption-write-configurations-test.cc index 93e28df507dd..cd7ae413317b 100644 --- a/cpp/src/parquet/encryption-write-configurations-test.cc +++ b/cpp/src/parquet/encryption-write-configurations-test.cc @@ -274,7 +274,7 @@ TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndTheFooter) { kFooterEncryptionKey_); 
this->EncryptFile(file_encryption_builder_2.footer_key_metadata("kf") - ->column_properties(encryption_cols2) + ->encrypted_columns(encryption_cols2) ->build(), "tmp_encrypt_columns_and_footer.parquet.encrypted"); } @@ -300,7 +300,7 @@ TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsWithPlaintextFooter) { kFooterEncryptionKey_); this->EncryptFile(file_encryption_builder_3.footer_key_metadata("kf") - ->column_properties(encryption_cols3) + ->encrypted_columns(encryption_cols3) ->set_plaintext_footer() ->build(), "tmp_encrypt_columns_plaintext_footer.parquet.encrypted"); @@ -326,7 +326,7 @@ TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndFooterWithAadPrefix) { kFooterEncryptionKey_); this->EncryptFile(file_encryption_builder_4.footer_key_metadata("kf") - ->column_properties(encryption_cols4) + ->encrypted_columns(encryption_cols4) ->aad_prefix(kFileName_) ->build(), "tmp_encrypt_columns_and_footer_aad.parquet.encrypted"); @@ -353,10 +353,10 @@ TEST_F(TestEncryptionConfiguration, kFooterEncryptionKey_); this->EncryptFile( - file_encryption_builder_5.column_properties(encryption_cols5) + file_encryption_builder_5.encrypted_columns(encryption_cols5) ->footer_key_metadata("kf") ->aad_prefix(kFileName_) - ->disable_store_aad_prefix_storage() + ->disable_aad_prefix_storage() ->build(), "tmp_encrypt_columns_and_footer_disable_aad_storage.parquet.encrypted"); } @@ -382,7 +382,7 @@ TEST_F(TestEncryptionConfiguration, EncryptTwoColumnsAndFooterUseAES_GCM_CTR) { EXPECT_NO_THROW( this->EncryptFile(file_encryption_builder_6.footer_key_metadata("kf") - ->column_properties(encryption_cols6) + ->encrypted_columns(encryption_cols6) ->algorithm(parquet::ParquetCipher::AES_GCM_CTR_V1) ->build(), "tmp_encrypt_columns_and_footer_ctr.parquet.encrypted")); diff --git a/cpp/src/parquet/encryption.cc b/cpp/src/parquet/encryption.cc index fd94b34a7087..5cd031f5d3d8 100644 --- a/cpp/src/parquet/encryption.cc +++ b/cpp/src/parquet/encryption.cc @@ -80,34 +80,35 @@ 
ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key_id return this; } -FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::column_properties( - const ColumnPathToDecryptionPropertiesMap& column_properties) { - if (column_properties.size() == 0) return this; +FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::column_keys( + const ColumnPathToDecryptionPropertiesMap& column_decryption_properties) { + if (column_decryption_properties.size() == 0) return this; - if (column_properties_.size() != 0) + if (column_decryption_properties_.size() != 0) throw ParquetException("Column properties already set"); - for (const auto& element : column_properties) { + for (const auto& element : column_decryption_properties) { if (element.second->is_utilized()) { throw ParquetException("Column properties utilized in another file"); } element.second->set_utilized(); } - column_properties_ = column_properties; + column_decryption_properties_ = column_decryption_properties; return this; } void FileDecryptionProperties::WipeOutDecryptionKeys() { footer_key_.clear(); - for (const auto& element : column_properties_) { + for (const auto& element : column_decryption_properties_) { element.second->WipeOutDecryptionKey(); } } bool FileDecryptionProperties::is_utilized() { - if (footer_key_.empty() && column_properties_.size() == 0 && aad_prefix_.empty()) + if (footer_key_.empty() && column_decryption_properties_.size() == 0 && + aad_prefix_.empty()) return false; return utilized_; @@ -116,17 +117,18 @@ bool FileDecryptionProperties::is_utilized() { std::shared_ptr FileDecryptionProperties::DeepClone( std::string new_aad_prefix) { std::string footer_key_copy = footer_key_; - ColumnPathToDecryptionPropertiesMap column_properties_map_copy; + ColumnPathToDecryptionPropertiesMap column_decryption_properties_map_copy; - for (const auto& element : column_properties_) { - column_properties_map_copy.insert( + for (const auto& element : 
column_decryption_properties_) { + column_decryption_properties_map_copy.insert( {element.second->column_path(), element.second->DeepClone()}); } if (new_aad_prefix.empty()) new_aad_prefix = aad_prefix_; return std::shared_ptr(new FileDecryptionProperties( footer_key_copy, key_retriever_, check_plaintext_footer_integrity_, new_aad_prefix, - aad_prefix_verifier_, column_properties_map_copy, plaintext_files_allowed_)); + aad_prefix_verifier_, column_decryption_properties_map_copy, + plaintext_files_allowed_)); } FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::footer_key( @@ -198,26 +200,26 @@ FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key return this; } -FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::column_properties( - const ColumnPathToEncryptionPropertiesMap& column_properties) { - if (column_properties.size() == 0) return this; +FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::encrypted_columns( + const ColumnPathToEncryptionPropertiesMap& encrypted_columns) { + if (encrypted_columns.size() == 0) return this; - if (column_properties_.size() != 0) + if (encrypted_columns_.size() != 0) throw ParquetException("Column properties already set"); - for (const auto& element : column_properties) { + for (const auto& element : encrypted_columns) { if (element.second->is_utilized()) { throw ParquetException("Column properties utilized in another file"); } element.second->set_utilized(); } - column_properties_ = column_properties; + encrypted_columns_ = encrypted_columns; return this; } void FileEncryptionProperties::WipeOutEncryptionKeys() { footer_key_.clear(); - for (const auto& element : column_properties_) { + for (const auto& element : encrypted_columns_) { element.second->WipeOutEncryptionKey(); } } @@ -225,17 +227,17 @@ void FileEncryptionProperties::WipeOutEncryptionKeys() { std::shared_ptr FileEncryptionProperties::DeepClone( std::string new_aad_prefix) { 
std::string footer_key_copy = footer_key_; - ColumnPathToEncryptionPropertiesMap column_properties_map_copy; + ColumnPathToEncryptionPropertiesMap encrypted_columns_map_copy; - for (const auto& element : column_properties_) { - column_properties_map_copy.insert( + for (const auto& element : encrypted_columns_) { + encrypted_columns_map_copy.insert( {element.second->column_path(), element.second->DeepClone()}); } if (new_aad_prefix.empty()) new_aad_prefix = aad_prefix_; return std::shared_ptr(new FileEncryptionProperties( algorithm_.algorithm, footer_key_copy, footer_key_metadata_, encrypted_footer_, - new_aad_prefix, store_aad_prefix_in_file_, column_properties_map_copy)); + new_aad_prefix, store_aad_prefix_in_file_, encrypted_columns_map_copy)); } FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::aad_prefix( @@ -249,7 +251,7 @@ FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::aad_prefix } FileEncryptionProperties::Builder* -FileEncryptionProperties::Builder::disable_store_aad_prefix_storage() { +FileEncryptionProperties::Builder::disable_aad_prefix_storage() { DCHECK(!aad_prefix_.empty()); store_aad_prefix_in_file_ = false; @@ -300,8 +302,9 @@ ColumnDecryptionProperties::ColumnDecryptionProperties( const std::string& FileDecryptionProperties::column_key( const std::shared_ptr& column_path) { - if (column_properties_.find(column_path) != column_properties_.end()) { - auto column_prop = column_properties_[column_path]; + if (column_decryption_properties_.find(column_path) != + column_decryption_properties_.end()) { + auto column_prop = column_decryption_properties_[column_path]; if (column_prop != nullptr) { return column_prop->key(); } @@ -314,10 +317,10 @@ FileDecryptionProperties::FileDecryptionProperties( const std::shared_ptr& key_retriever, bool check_plaintext_footer_integrity, const std::string& aad_prefix, std::shared_ptr aad_prefix_verifier, - const ColumnPathToDecryptionPropertiesMap& column_properties, + const 
ColumnPathToDecryptionPropertiesMap& column_decryption_properties, bool plaintext_files_allowed) { DCHECK(!footer_key.empty() || NULLPTR != key_retriever || - 0 != column_properties.size()); + 0 != column_decryption_properties.size()); if (!footer_key.empty()) { DCHECK(footer_key.length() == 16 || footer_key.length() == 24 || @@ -331,7 +334,7 @@ FileDecryptionProperties::FileDecryptionProperties( check_plaintext_footer_integrity_ = check_plaintext_footer_integrity; key_retriever_ = key_retriever; aad_prefix_ = aad_prefix; - column_properties_ = column_properties; + column_decryption_properties_ = column_decryption_properties; plaintext_files_allowed_ = plaintext_files_allowed; utilized_ = false; } @@ -352,15 +355,16 @@ FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key return footer_key_metadata(key_id); } -std::shared_ptr FileEncryptionProperties::column_properties( +std::shared_ptr +FileEncryptionProperties::column_encryption_properties( const std::shared_ptr& column_path) { - if (column_properties_.size() == 0) { + if (encrypted_columns_.size() == 0) { auto builder = std::shared_ptr( new ColumnEncryptionProperties::Builder(column_path)); return builder->build(); } - if (column_properties_.find(column_path) != column_properties_.end()) { - return column_properties_[column_path]; + if (encrypted_columns_.find(column_path) != encrypted_columns_.end()) { + return encrypted_columns_[column_path]; } return nullptr; @@ -370,13 +374,13 @@ FileEncryptionProperties::FileEncryptionProperties( ParquetCipher::type cipher, const std::string& footer_key, const std::string& footer_key_metadata, bool encrypted_footer, const std::string& aad_prefix, bool store_aad_prefix_in_file, - const ColumnPathToEncryptionPropertiesMap& column_properties) + const ColumnPathToEncryptionPropertiesMap& encrypted_columns) : footer_key_(footer_key), footer_key_metadata_(footer_key_metadata), encrypted_footer_(encrypted_footer), aad_prefix_(aad_prefix), 
store_aad_prefix_in_file_(store_aad_prefix_in_file), - column_properties_(column_properties) { + encrypted_columns_(encrypted_columns) { // file encryption properties object can be used for writing only one file. // Upon completion of file writing, the encryption keys in the properties will be wiped // out (set to 0 in memory). diff --git a/cpp/src/parquet/encryption.h b/cpp/src/parquet/encryption.h index 4b0eb8a46b76..96fa7e100ffa 100644 --- a/cpp/src/parquet/encryption.h +++ b/cpp/src/parquet/encryption.h @@ -100,9 +100,8 @@ class PARQUET_EXPORT ColumnEncryptionProperties { class PARQUET_EXPORT Builder { public: /// Convenience builder for regular (not nested) columns. - explicit Builder(const std::string& name) { - Builder(schema::ColumnPath::FromDotString(name), true); - } + explicit Builder(const std::string& name) + : Builder(schema::ColumnPath::FromDotString(name), true) {} /// Convenience builder for encrypted columns. explicit Builder(const std::shared_ptr& path) @@ -282,8 +281,8 @@ class PARQUET_EXPORT FileDecryptionProperties { /// invocation of the retriever callback. /// If an explicit key is available for a footer or a column, /// its key metadata will be ignored. - Builder* column_properties( - const ColumnPathToDecryptionPropertiesMap& column_properties); + Builder* column_keys( + const ColumnPathToDecryptionPropertiesMap& column_decryption_properties); /// Set a key retriever callback. Its also possible to /// set explicit footer or column keys on this file property object. 
@@ -327,14 +326,14 @@ class PARQUET_EXPORT FileDecryptionProperties { std::shared_ptr build() { return std::shared_ptr(new FileDecryptionProperties( footer_key_, key_retriever_, check_plaintext_footer_integrity_, aad_prefix_, - aad_prefix_verifier_, column_properties_, plaintext_files_allowed_)); + aad_prefix_verifier_, column_decryption_properties_, plaintext_files_allowed_)); } private: std::string footer_key_; std::string aad_prefix_; std::shared_ptr aad_prefix_verifier_; - ColumnPathToDecryptionPropertiesMap column_properties_; + ColumnPathToDecryptionPropertiesMap column_decryption_properties_; std::shared_ptr key_retriever_; bool check_plaintext_footer_integrity_; @@ -382,20 +381,20 @@ class PARQUET_EXPORT FileDecryptionProperties { std::shared_ptr aad_prefix_verifier_; const std::string empty_string_ = ""; - ColumnPathToDecryptionPropertiesMap column_properties_; + ColumnPathToDecryptionPropertiesMap column_decryption_properties_; std::shared_ptr key_retriever_; bool check_plaintext_footer_integrity_; bool plaintext_files_allowed_; bool utilized_; - FileDecryptionProperties(const std::string& footer_key, - const std::shared_ptr& key_retriever, - bool check_plaintext_footer_integrity, - const std::string& aad_prefix, - std::shared_ptr aad_prefix_verifier, - const ColumnPathToDecryptionPropertiesMap& column_properties, - bool plaintext_files_allowed); + FileDecryptionProperties( + const std::string& footer_key, + const std::shared_ptr& key_retriever, + bool check_plaintext_footer_integrity, const std::string& aad_prefix, + std::shared_ptr aad_prefix_verifier, + const ColumnPathToDecryptionPropertiesMap& column_decryption_properties, + bool plaintext_files_allowed); }; class PARQUET_EXPORT FileEncryptionProperties { @@ -436,18 +435,18 @@ class PARQUET_EXPORT FileEncryptionProperties { /// Skip storing AAD Prefix in file. /// If not called, and if AAD Prefix is set, it will be stored. 
- Builder* disable_store_aad_prefix_storage(); + Builder* disable_aad_prefix_storage(); /// Set the list of encrypted columns and their properties (keys etc). /// If not called, all columns will be encrypted with the footer key. /// If called, the file columns not in the list will be left unencrypted. - Builder* column_properties( - const ColumnPathToEncryptionPropertiesMap& column_properties); + Builder* encrypted_columns( + const ColumnPathToEncryptionPropertiesMap& encrypted_columns); std::shared_ptr build() { return std::shared_ptr(new FileEncryptionProperties( parquet_cipher_, footer_key_, footer_key_metadata_, encrypted_footer_, - aad_prefix_, store_aad_prefix_in_file_, column_properties_)); + aad_prefix_, store_aad_prefix_in_file_, encrypted_columns_)); } private: @@ -458,7 +457,7 @@ class PARQUET_EXPORT FileEncryptionProperties { std::string aad_prefix_; bool store_aad_prefix_in_file_; - ColumnPathToEncryptionPropertiesMap column_properties_; + ColumnPathToEncryptionPropertiesMap encrypted_columns_; }; bool encrypted_footer() const { return encrypted_footer_; } @@ -470,7 +469,7 @@ class PARQUET_EXPORT FileEncryptionProperties { const std::string& file_aad() const { return file_aad_; } - std::shared_ptr column_properties( + std::shared_ptr column_encryption_properties( const std::shared_ptr& column_path); bool is_utilized() { return utilized_; } @@ -491,7 +490,7 @@ class PARQUET_EXPORT FileEncryptionProperties { /// aadPrefix will be cloned too) std::shared_ptr DeepClone(std::string new_aad_prefix = ""); - ColumnPathToEncryptionPropertiesMap column_properties() { return column_properties_; } + ColumnPathToEncryptionPropertiesMap encrypted_columns() { return encrypted_columns_; } private: EncryptionAlgorithm algorithm_; @@ -502,12 +501,12 @@ class PARQUET_EXPORT FileEncryptionProperties { std::string aad_prefix_; bool utilized_; bool store_aad_prefix_in_file_; - ColumnPathToEncryptionPropertiesMap column_properties_; + ColumnPathToEncryptionPropertiesMap 
encrypted_columns_; FileEncryptionProperties(ParquetCipher::type cipher, const std::string& footer_key, const std::string& footer_key_metadata, bool encrypted_footer, const std::string& aad_prefix, bool store_aad_prefix_in_file, - const ColumnPathToEncryptionPropertiesMap& column_properties); + const ColumnPathToEncryptionPropertiesMap& encrypted_columns); }; } // namespace parquet diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 6d73bfb01481..16d5e77c783d 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -87,7 +87,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents { properties_(properties), total_bytes_written_(0), closed_(false), - row_group_ordinal_ (row_group_ordinal), + row_group_ordinal_(row_group_ordinal), next_column_index_(0), num_rows_(0), buffered_row_group_(buffered_row_group), @@ -393,16 +393,16 @@ class FileSerializer : public ParquetFileWriter::Contents { PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4)); } else { // Check that all columns in columnEncryptionProperties exist in the schema. - auto columnEncryptionProperties = file_encryption_properties->column_properties(); + auto encrypted_columns = file_encryption_properties->encrypted_columns(); // if columnEncryptionProperties is empty, every column in file schema will be // encrypted with footer key. - if (columnEncryptionProperties.size() != 0) { + if (encrypted_columns.size() != 0) { std::vector> column_path_vec; // First, save all column paths in schema. for (int i = 0; i < num_columns(); i++) column_path_vec.push_back(schema_.Column(i)->path()); // Check if column exists in schema. 
- for (const auto& elem : columnEncryptionProperties) { + for (const auto& elem : encrypted_columns) { auto it = std::find_if(column_path_vec.begin(), column_path_vec.end(), [&](std::shared_ptr const& p) { return (p->ToDotString() == elem.first->ToDotString()); diff --git a/cpp/src/parquet/internal_file_encryptor.cc b/cpp/src/parquet/internal_file_encryptor.cc index 63a84557c9d7..9c9073fa2c5c 100644 --- a/cpp/src/parquet/internal_file_encryptor.cc +++ b/cpp/src/parquet/internal_file_encryptor.cc @@ -108,7 +108,7 @@ InternalFileEncryptor::InternalFileEncryptor::GetColumnEncryptor( return column_data_map_.at(column_path); } } - auto column_prop = properties_->column_properties(column_path); + auto column_prop = properties_->column_encryption_properties(column_path); if (column_prop == NULLPTR) { return nullptr; } diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 74bb39885a4a..dd20cbeadead 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -423,7 +423,7 @@ class PARQUET_EXPORT WriterProperties { std::shared_ptr column_encryption_properties( const std::shared_ptr& path) const { if (file_encryption_properties_) { - return file_encryption_properties_->column_properties(path); + return file_encryption_properties_->column_encryption_properties(path); } else { return NULLPTR; } diff --git a/cpp/src/parquet/test-encryption-util.cc b/cpp/src/parquet/test-encryption-util.cc deleted file mode 100644 index a1cc914205be..000000000000 --- a/cpp/src/parquet/test-encryption-util.cc +++ /dev/null @@ -1,37 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// This module defines an abstract interface for iterating through pages in a -// Parquet column chunk within a row group. It could be extended in the future -// to iterate through all data pages in all chunks in a file. - -#include "parquet/test-encryption-util.h" - -#include "parquet/test-util.h" - -namespace parquet { -namespace test { - -std::string data_file(const char* file) { - std::string dir_string(test::get_data_dir()); - std::stringstream ss; - ss << dir_string << "/" << file; - return ss.str(); -} - -} // namespace test -} // namespace parquet diff --git a/cpp/src/parquet/test-encryption-util.h b/cpp/src/parquet/test-encryption-util.h index 547e9f69a84f..202f40a1176f 100644 --- a/cpp/src/parquet/test-encryption-util.h +++ b/cpp/src/parquet/test-encryption-util.h @@ -39,12 +39,11 @@ #include "parquet/column_writer.h" #include "parquet/encoding.h" #include "parquet/platform.h" +#include "parquet/test-util.h" namespace parquet { namespace test { -std::string data_file(const char* file); - using parquet::ConvertedType; using parquet::Repetition; using parquet::Type; @@ -59,5 +58,12 @@ const char kColumnEncryptionKey1[] = "1234567890123450"; const char kColumnEncryptionKey2[] = "1234567890123451"; const char kFileName[] = "tester"; +inline std::string data_file(const char* file) { + std::string dir_string(test::get_data_dir()); + std::stringstream ss; + ss << dir_string << "/" << file; + return ss.str(); +} + } // namespace test } // namespace parquet From cbc3e0e7d70c57128755e4c2b2e407ea9aa9c2e0 Mon Sep 17 00:00:00 2001 From: Ha Thi Tham 
Date: Wed, 21 Aug 2019 16:29:39 +0700 Subject: [PATCH 144/201] rename test file using underscore --- cpp/src/parquet/CMakeLists.txt | 8 ++++---- ...n-properties-test.cc => encryption_properties_test.cc} | 0 ...ons-test.cc => encryption_read_configurations_test.cc} | 4 ++-- ...ns-test.cc => encryption_write_configurations_test.cc} | 4 ++-- .../{test-encryption-util.h => test_encryption_util.h} | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) rename cpp/src/parquet/{encryption-properties-test.cc => encryption_properties_test.cc} (100%) rename cpp/src/parquet/{encryption-read-configurations-test.cc => encryption_read_configurations_test.cc} (99%) rename cpp/src/parquet/{encryption-write-configurations-test.cc => encryption_write_configurations_test.cc} (99%) rename cpp/src/parquet/{test-encryption-util.h => test_encryption_util.h} (98%) diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index e76fd07484a0..74fa9e875b45 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -329,10 +329,10 @@ add_parquet_test(arrow-test if(ARROW_USE_OPENSSL) add_parquet_test(encryption-test SOURCES - encryption-write-configurations-test.cc - encryption-read-configurations-test.cc - encryption-properties-test.cc - test-util.cc) + encryption_write_configurations_test.cc + encryption_read_configurations_test.cc + encryption_properties_test.cc + test_util.cc) endif() # Those tests need to use static linking as they access thrift-generated diff --git a/cpp/src/parquet/encryption-properties-test.cc b/cpp/src/parquet/encryption_properties_test.cc similarity index 100% rename from cpp/src/parquet/encryption-properties-test.cc rename to cpp/src/parquet/encryption_properties_test.cc diff --git a/cpp/src/parquet/encryption-read-configurations-test.cc b/cpp/src/parquet/encryption_read_configurations_test.cc similarity index 99% rename from cpp/src/parquet/encryption-read-configurations-test.cc rename to 
cpp/src/parquet/encryption_read_configurations_test.cc index 225912f70665..53dae6a9fcca 100644 --- a/cpp/src/parquet/encryption-read-configurations-test.cc +++ b/cpp/src/parquet/encryption_read_configurations_test.cc @@ -27,8 +27,8 @@ #include "parquet/file_reader.h" #include "parquet/file_writer.h" #include "parquet/platform.h" -#include "parquet/test-encryption-util.h" -#include "parquet/test-util.h" +#include "parquet/test_encryption_util.h" +#include "parquet/test_util.h" /* * This file contains a unit-test for reading encrypted Parquet files with diff --git a/cpp/src/parquet/encryption-write-configurations-test.cc b/cpp/src/parquet/encryption_write_configurations_test.cc similarity index 99% rename from cpp/src/parquet/encryption-write-configurations-test.cc rename to cpp/src/parquet/encryption_write_configurations_test.cc index cd7ae413317b..e15fd38f146a 100644 --- a/cpp/src/parquet/encryption-write-configurations-test.cc +++ b/cpp/src/parquet/encryption_write_configurations_test.cc @@ -26,8 +26,8 @@ #include "parquet/file_reader.h" #include "parquet/file_writer.h" #include "parquet/platform.h" -#include "parquet/test-encryption-util.h" -#include "parquet/test-util.h" +#include "parquet/test_encryption_util.h" +#include "parquet/test_util.h" /* * This file contains unit-tests for writing encrypted Parquet files with diff --git a/cpp/src/parquet/test-encryption-util.h b/cpp/src/parquet/test_encryption_util.h similarity index 98% rename from cpp/src/parquet/test-encryption-util.h rename to cpp/src/parquet/test_encryption_util.h index 202f40a1176f..e430246eb272 100644 --- a/cpp/src/parquet/test-encryption-util.h +++ b/cpp/src/parquet/test_encryption_util.h @@ -39,7 +39,7 @@ #include "parquet/column_writer.h" #include "parquet/encoding.h" #include "parquet/platform.h" -#include "parquet/test-util.h" +#include "parquet/test_util.h" namespace parquet { namespace test { From 1f6479e21a5232804290e2b3c5deeeebbe73fb07 Mon Sep 17 00:00:00 2001 From: thamht4190 Date: 
Wed, 21 Aug 2019 22:30:50 +0700 Subject: [PATCH 145/201] fix make lint --- .../encryption-reader-writer-all-crypto-options.cc | 2 -- cpp/src/parquet/metadata.cc | 1 - 2 files changed, 3 deletions(-) diff --git a/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc index 710c3c05829a..4dc798da2483 100644 --- a/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc +++ b/cpp/examples/parquet/low-level-api/encryption-reader-writer-all-crypto-options.cc @@ -106,8 +106,6 @@ using parquet::Type; using parquet::schema::GroupNode; using parquet::schema::PrimitiveNode; -constexpr int FIXED_LENGTH = 10; - void PrintDecryptionConfiguration(int configuration); // Check that the decryption result is as expected. void CheckResult(std::string file, int example_id, std::string exception_msg); diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index ed8b205e7c28..0a42ec663493 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -24,7 +24,6 @@ #include "arrow/util/logging.h" #include -#include // IWYU pragma: keep #include "parquet/encryption_internal.h" #include "parquet/exception.h" #include "parquet/internal_file_decryptor.h" From 487b3299bf4cd15736d859a2ebb4818bf8eff7ac Mon Sep 17 00:00:00 2001 From: thamht4190 Date: Sat, 24 Aug 2019 10:38:50 +0700 Subject: [PATCH 146/201] fix a bad merge in r/configure.win --- r/configure.win | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/r/configure.win b/r/configure.win index c22cb08822e2..44dbb4c5f7c3 100644 --- a/r/configure.win +++ b/r/configure.win @@ -31,12 +31,6 @@ VERSION=$(grep ^Version DESCRIPTION | sed s/Version:\ //) # If binary not found, script exits nonzero if [ $? 
-ne 0 ]; then echo "Arrow C++ library was not found" -else - # Set the right flags to point to and enable arrow/parquet - RWINLIB="../windows/arrow-${VERSION}" - OPENSSL_LIBS="-lcrypto -lcrypt32" - PKG_CFLAGS="-I${RWINLIB}/include -DARROW_STATIC -DPARQUET_STATIC -DARROW_R_WITH_ARROW" - PKG_LIBS="-L${RWINLIB}/lib"'$(subst gcc,,$(COMPILED_BY))$(R_ARCH) '"-L${RWINLIB}/lib"'$(R_ARCH) '"-lparquet -larrow -lthrift -lsnappy -lboost_regex-mt-s -lboost_filesystem-mt-s -lboost_system-mt-s -ldouble-conversion -lz ${OPENSSL_LIBS} -lws2_32" fi # Set the right flags to point to and enable arrow/parquet @@ -44,9 +38,10 @@ fi # R version, e.g. if the R build is a patch release, so find what the dir is # actually called: RWINLIB="../windows/$(ls windows/ | grep ^arrow-)" +OPENSSL_LIBS="-lcrypto -lcrypt32" PKG_CFLAGS="-I${RWINLIB}/include -DARROW_STATIC -DPARQUET_STATIC -DARROW_R_WITH_ARROW" -PKG_LIBS="-L${RWINLIB}/lib"'$(subst gcc,,$(COMPILED_BY))$(R_ARCH) '"-L${RWINLIB}/lib"'$(R_ARCH) '"-lparquet -larrow -lthrift -lsnappy -lboost_regex-mt-s -lboost_filesystem-mt-s -lboost_system-mt-s -ldouble-conversion -lz -lws2_32" +PKG_LIBS="-L${RWINLIB}/lib"'$(subst gcc,,$(COMPILED_BY))$(R_ARCH) '"-L${RWINLIB}/lib"'$(R_ARCH) '"-lparquet -larrow -lthrift -lsnappy -lboost_regex-mt-s -lboost_filesystem-mt-s -lboost_system-mt-s -ldouble-conversion -lz ${OPENSSL_LIBS} -lws2_32" echo "*** Writing Makevars.win" sed -e "s|@cflags@|$PKG_CFLAGS|" -e "s|@libs@|$PKG_LIBS|" src/Makevars.in > src/Makevars.win From 861ef2cff90041bb407e89eccc5b8ca4662adb8c Mon Sep 17 00:00:00 2001 From: Ha Thi Tham Date: Mon, 16 Sep 2019 10:16:43 +0700 Subject: [PATCH 147/201] add PARQUET_EXPORT to Encryptor, Decryptor --- cpp/src/parquet/internal_file_decryptor.h | 2 +- cpp/src/parquet/internal_file_encryptor.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/internal_file_decryptor.h b/cpp/src/parquet/internal_file_decryptor.h index 942f156feb85..0545bb3e180d 100644 --- 
a/cpp/src/parquet/internal_file_decryptor.h +++ b/cpp/src/parquet/internal_file_decryptor.h @@ -34,7 +34,7 @@ class AesEncryptor; class FileDecryptionProperties; -class Decryptor { +class PARQUET_EXPORT Decryptor { public: Decryptor(encryption::AesDecryptor* decryptor, const std::string& key, const std::string& file_aad, const std::string& aad, diff --git a/cpp/src/parquet/internal_file_encryptor.h b/cpp/src/parquet/internal_file_encryptor.h index 7d2ce7f4f128..0d0d9a5446a2 100644 --- a/cpp/src/parquet/internal_file_encryptor.h +++ b/cpp/src/parquet/internal_file_encryptor.h @@ -35,7 +35,7 @@ class AesEncryptor; class FileEncryptionProperties; class ColumnEncryptionProperties; -class Encryptor { +class PARQUET_EXPORT Encryptor { public: Encryptor(encryption::AesEncryptor* aes_encryptor, const std::string& key, const std::string& file_aad, const std::string& aad, From b6f6b2bb08f220b5dc83517de1d0d4bc9152563a Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Mon, 23 Sep 2019 14:11:55 -0400 Subject: [PATCH 148/201] fix errors when encryption is disabled --- cpp/src/parquet/encryption_internal_nossl.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/src/parquet/encryption_internal_nossl.cc b/cpp/src/parquet/encryption_internal_nossl.cc index 0a33f8509233..2fd611482052 100644 --- a/cpp/src/parquet/encryption_internal_nossl.cc +++ b/cpp/src/parquet/encryption_internal_nossl.cc @@ -30,9 +30,9 @@ class AesEncryptor::AesEncryptorImpl {}; AesEncryptor::~AesEncryptor() {} -int AesEncryptor::SignedFooterEncrypt(const uint8_t* footer, int footer_len, uint8_t* key, - int key_len, uint8_t* aad, int aad_len, - uint8_t* nonce, uint8_t* encrypted_footer) { +int AesEncryptor::SignedFooterEncrypt(const uint8_t* footer, int footer_len, const uint8_t* key, + int key_len, const uint8_t* aad, int aad_len, + const uint8_t* nonce, uint8_t* encrypted_footer) { ThrowOpenSSLRequiredException(); return -1; } @@ -44,8 +44,8 @@ int 
AesEncryptor::CiphertextSizeDelta() { return -1; } -int AesEncryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* key, - int key_len, uint8_t* aad, int aad_len, uint8_t* ciphertext) { +int AesEncryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, const uint8_t* key, + int key_len, const uint8_t* aad, int aad_len, uint8_t* ciphertext) { ThrowOpenSSLRequiredException(); return -1; } @@ -56,8 +56,8 @@ AesEncryptor::AesEncryptor(ParquetCipher::type alg_id, int key_len, bool metadat class AesDecryptor::AesDecryptorImpl {}; -int AesDecryptor::Decrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* key, - int key_len, uint8_t* aad, int aad_len, uint8_t* ciphertext) { +int AesDecryptor::Decrypt(const uint8_t* plaintext, int plaintext_len, const uint8_t* key, + int key_len, const uint8_t* aad, int aad_len, uint8_t* ciphertext) { ThrowOpenSSLRequiredException(); return -1; } From bdded5ec61617ce254ce56f3ec635d96ee3396b6 Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Mon, 23 Sep 2019 14:20:55 -0400 Subject: [PATCH 149/201] do not build encryption support by default when Parquet is built --- .travis.yml | 3 +++ ci/travis_before_script_cpp.sh | 4 ++++ cpp/cmake_modules/ThirdpartyToolchain.cmake | 9 ++------- cpp/examples/parquet/CMakeLists.txt | 4 ++-- cpp/src/parquet/CMakeLists.txt | 4 ++-- 5 files changed, 13 insertions(+), 11 deletions(-) diff --git a/.travis.yml b/.travis.yml index 1261f3877a57..b52e1c499d07 100644 --- a/.travis.yml +++ b/.travis.yml @@ -74,6 +74,7 @@ matrix: - ARROW_TRAVIS_GANDIVA_JAVA=1 - ARROW_TRAVIS_ORC=1 - ARROW_TRAVIS_PARQUET=1 + - ARROW_TRAVIS_PARQUET_ENCRYPTION=1 - ARROW_TRAVIS_PLASMA=1 - ARROW_TRAVIS_S3=1 - ARROW_TRAVIS_USE_SYSTEM_JAVA=1 @@ -135,6 +136,7 @@ matrix: - ARROW_TRAVIS_FLIGHT=1 - ARROW_TRAVIS_ORC=1 - ARROW_TRAVIS_PARQUET=1 + - ARROW_TRAVIS_PARQUET_ENCRYPTION=1 # TODO(ARROW-4763): llvm and llvmdev packages are in conflict: # https://github.com/conda-forge/llvmdev-feedstock/issues/60 # - 
ARROW_TRAVIS_GANDIVA=1 @@ -299,6 +301,7 @@ matrix: dist: xenial env: - ARROW_TRAVIS_PARQUET=1 + - ARROW_TRAVIS_PARQUET_ENCRYPTION=1 - ARROW_TRAVIS_USE_SYSTEM=1 before_install: # Have to copy-paste this here because of how R's build steps work diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index 9c6caffd48d8..6a92d80ac3b7 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -133,6 +133,10 @@ if [ "$ARROW_TRAVIS_PARQUET" == "1" ]; then -DPARQUET_BUILD_EXECUTABLES=ON" fi +if [ "$ARROW_TRAVIS_PARQUET_ENCRYPTION" == "1" ]; then + CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DPARQUET_REQUIRE_ENCRYPTION=ON" +fi + if [ "$ARROW_TRAVIS_GANDIVA" == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_GANDIVA=ON" if [ "$ARROW_TRAVIS_GANDIVA_JAVA" == "1" ]; then diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index caf307f07f99..09a3ddd0f05d 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -916,7 +916,6 @@ if(ARROW_WITH_BROTLI) include_directories(SYSTEM ${BROTLI_INCLUDE_DIR}) endif() -set(ARROW_USE_OPENSSL OFF) if(PARQUET_REQUIRE_ENCRYPTION AND NOT ARROW_PARQUET) set(PARQUET_REQUIRE_ENCRYPTION OFF) endif() @@ -929,16 +928,12 @@ if(BREW_BIN AND NOT OPENSSL_ROOT_DIR) set(OPENSSL_ROOT_DIR ${OPENSSL_BREW_PREFIX}) endif() endif() + +set(ARROW_USE_OPENSSL OFF) if(PARQUET_REQUIRE_ENCRYPTION OR ARROW_FLIGHT OR ARROW_S3) # This must work find_package(OpenSSL ${ARROW_OPENSSL_REQUIRED_VERSION} REQUIRED) set(ARROW_USE_OPENSSL ON) -elseif(ARROW_PARQUET) - # Enable Parquet encryption if OpenSSL is there, but don't fail if it's not - find_package(OpenSSL ${ARROW_OPENSSL_REQUIRED_VERSION} QUIET) - if(OPENSSL_FOUND) - set(ARROW_USE_OPENSSL ON) - endif() endif() if(ARROW_USE_OPENSSL) diff --git a/cpp/examples/parquet/CMakeLists.txt b/cpp/examples/parquet/CMakeLists.txt index 7a45ab239f05..86d8d870e1a0 100644 --- 
a/cpp/examples/parquet/CMakeLists.txt +++ b/cpp/examples/parquet/CMakeLists.txt @@ -21,7 +21,7 @@ add_executable(parquet-arrow-example parquet-arrow/reader-writer.cc) target_include_directories(parquet-low-level-example PRIVATE low-level-api/) target_include_directories(parquet-low-level-example2 PRIVATE low-level-api/) -if (ARROW_USE_OPENSSL) +if (PARQUET_REQUIRE_ENCRYPTION) add_executable(parquet-encryption-example low-level-api/encryption-reader-writer.cc) add_executable(parquet-encryption-example-all-crypto-options low-level-api/encryption-reader-writer-all-crypto-options.cc) target_include_directories(parquet-encryption-example PRIVATE low-level-api/) @@ -46,7 +46,7 @@ add_dependencies(parquet parquet-low-level-example2 parquet-arrow-example) -if (ARROW_USE_OPENSSL) +if (PARQUET_REQUIRE_ENCRYPTION) add_dependencies(parquet parquet-encryption-example parquet-encryption-example-all-crypto-options) diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 3e3e9a37af6f..0659767a578d 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -171,7 +171,7 @@ set(PARQUET_SRCS statistics.cc types.cc) -if(ARROW_USE_OPENSSL) +if(PARQUET_REQUIRE_ENCRYPTION) set(PARQUET_SRCS ${PARQUET_SRCS} encryption_internal.cc) else() set(PARQUET_SRCS ${PARQUET_SRCS} encryption_internal_nossl.cc) @@ -315,7 +315,7 @@ add_parquet_test(arrow-test arrow/arrow_schema_test.cc test_util.cc) -if(ARROW_USE_OPENSSL) +if(PARQUET_REQUIRE_ENCRYPTION) add_parquet_test(encryption-test SOURCES encryption_write_configurations_test.cc From ef0583baf0bec5abe9c7b06faacdff40b73c614c Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Mon, 23 Sep 2019 14:24:37 -0400 Subject: [PATCH 150/201] specify PARQUET_REQUIRE_ENCRYPTION in doc, build, CI, etc. 
--- ci/appveyor-cpp-build-mingw.bat | 1 + ci/cpp-msvc-build-main.bat | 1 + ci/travis_script_python.sh | 1 + dev/gen_apidocs/create_documents.sh | 1 + dev/lint/run_iwyu.sh | 1 + dev/release/verify-release-candidate.sh | 1 + docs/source/developers/cpp.rst | 4 +++- 7 files changed, 9 insertions(+), 1 deletion(-) diff --git a/ci/appveyor-cpp-build-mingw.bat b/ci/appveyor-cpp-build-mingw.bat index 0fccf2a09113..341154c70cfd 100644 --- a/ci/appveyor-cpp-build-mingw.bat +++ b/ci/appveyor-cpp-build-mingw.bat @@ -46,6 +46,7 @@ cmake ^ -DARROW_BUILD_TESTS=ON ^ -DARROW_PACKAGE_PREFIX=%MINGW_PREFIX% ^ -DARROW_PARQUET=ON ^ + -DPARQUET_REQUIRE_ENCRYPTION=ON ^ -DARROW_PYTHON=ON ^ -DARROW_USE_GLOG=OFF ^ -DCMAKE_BUILD_TYPE=%CMAKE_BUILD_TYPE% ^ diff --git a/ci/cpp-msvc-build-main.bat b/ci/cpp-msvc-build-main.bat index b088e2eec763..b0a87739a504 100644 --- a/ci/cpp-msvc-build-main.bat +++ b/ci/cpp-msvc-build-main.bat @@ -74,6 +74,7 @@ cmake -G "%GENERATOR%" %CMAKE_ARGS% ^ -DARROW_S3=%ARROW_S3% ^ -DARROW_MIMALLOC=ON ^ -DARROW_PARQUET=ON ^ + -DPARQUET_REQUIRE_ENCRYPTION=ON ^ -DPARQUET_BUILD_EXECUTABLES=ON ^ -DARROW_PYTHON=ON ^ .. 
|| exit /B diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index 202c24f0a58c..46e6ab6bc2a3 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -131,6 +131,7 @@ cmake -GNinja \ -DARROW_BUILD_UTILITIES=OFF \ -DARROW_OPTIONAL_INSTALL=ON \ -DARROW_PARQUET=on \ + -DPARQUET_REQUIRE_ENCRYPTION=on \ -DARROW_PLASMA=on \ -DARROW_TENSORFLOW=on \ -DARROW_PYTHON=on \ diff --git a/dev/gen_apidocs/create_documents.sh b/dev/gen_apidocs/create_documents.sh index 142e6e0b377b..9330075cb43b 100755 --- a/dev/gen_apidocs/create_documents.sh +++ b/dev/gen_apidocs/create_documents.sh @@ -59,6 +59,7 @@ cmake -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ -DARROW_PYTHON=ON \ -DARROW_PLASMA=ON \ -DARROW_PARQUET=ON \ + -DPARQUET_REQUIRE_ENCRYPTION=ON \ -DARROW_ORC=ON \ -DARROW_BUILD_TESTS=OFF \ -GNinja \ diff --git a/dev/lint/run_iwyu.sh b/dev/lint/run_iwyu.sh index 46517cfa91ca..a4040b08a10d 100755 --- a/dev/lint/run_iwyu.sh +++ b/dev/lint/run_iwyu.sh @@ -26,6 +26,7 @@ cmake -GNinja \ -DARROW_FLIGHT=ON \ -DARROW_GANDIVA=ON \ -DARROW_PARQUET=ON \ + -DPARQUET_REQUIRE_ENCRYPTION=ON \ -DARROW_PYTHON=ON \ -DCMAKE_CXX_FLAGS='-D_GLIBCXX_USE_CXX11_ABI=0' \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 0acb56e4d8a3..f9e0ce27811b 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -263,6 +263,7 @@ ${ARROW_CMAKE_OPTIONS:-} -DARROW_PYTHON=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON +-DPARQUET_REQUIRE_ENCRYPTION=ON -DARROW_BOOST_USE_SHARED=ON -DCMAKE_BUILD_TYPE=release -DARROW_BUILD_TESTS=ON diff --git a/docs/source/developers/cpp.rst b/docs/source/developers/cpp.rst index 2384ca44be28..a0a4ae116e64 100644 --- a/docs/source/developers/cpp.rst +++ b/docs/source/developers/cpp.rst @@ -951,7 +951,9 @@ Apache Parquet Development ========================== To build the C++ libraries for Apache Parquet, add the flag 
-``-DARROW_PARQUET=ON`` when invoking CMake. The Parquet libraries and unit tests +``-DARROW_PARQUET=ON`` when invoking CMake. +To build Apache Parquet with encryption support, add the flag +``-DPARQUET_REQUIRE_ENCRYPTION=ON`` when invoking CMake. The Parquet libraries and unit tests can be built with the ``parquet`` make target: .. code-block:: shell From 8e03784149af5893ecfc54cd9f559cb0855d8b31 Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Tue, 24 Sep 2019 14:37:18 -0400 Subject: [PATCH 151/201] fix format --- cpp/src/parquet/encryption_internal_nossl.cc | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/encryption_internal_nossl.cc b/cpp/src/parquet/encryption_internal_nossl.cc index 2fd611482052..9e43c9c555e8 100644 --- a/cpp/src/parquet/encryption_internal_nossl.cc +++ b/cpp/src/parquet/encryption_internal_nossl.cc @@ -30,9 +30,10 @@ class AesEncryptor::AesEncryptorImpl {}; AesEncryptor::~AesEncryptor() {} -int AesEncryptor::SignedFooterEncrypt(const uint8_t* footer, int footer_len, const uint8_t* key, - int key_len, const uint8_t* aad, int aad_len, - const uint8_t* nonce, uint8_t* encrypted_footer) { +int AesEncryptor::SignedFooterEncrypt(const uint8_t* footer, int footer_len, + const uint8_t* key, int key_len, const uint8_t* aad, + int aad_len, const uint8_t* nonce, + uint8_t* encrypted_footer) { ThrowOpenSSLRequiredException(); return -1; } @@ -45,7 +46,8 @@ int AesEncryptor::CiphertextSizeDelta() { } int AesEncryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, const uint8_t* key, - int key_len, const uint8_t* aad, int aad_len, uint8_t* ciphertext) { + int key_len, const uint8_t* aad, int aad_len, + uint8_t* ciphertext) { ThrowOpenSSLRequiredException(); return -1; } @@ -57,7 +59,8 @@ AesEncryptor::AesEncryptor(ParquetCipher::type alg_id, int key_len, bool metadat class AesDecryptor::AesDecryptorImpl {}; int AesDecryptor::Decrypt(const uint8_t* plaintext, int plaintext_len, const uint8_t* key, - 
int key_len, const uint8_t* aad, int aad_len, uint8_t* ciphertext) { + int key_len, const uint8_t* aad, int aad_len, + uint8_t* ciphertext) { ThrowOpenSSLRequiredException(); return -1; } From f6472cec7682060ac53b76c9fc854bf9b26cff7c Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Tue, 24 Sep 2019 17:56:06 -0400 Subject: [PATCH 152/201] refactor column reader --- cpp/src/parquet/column_reader.cc | 131 ++++++++++++------------------- cpp/src/parquet/column_reader.h | 20 +++-- cpp/src/parquet/file_reader.cc | 17 ++-- 3 files changed, 72 insertions(+), 96 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index f117390872aa..623f2740aeda 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -119,28 +119,16 @@ class SerializedPageReader : public PageReader { public: SerializedPageReader(const std::shared_ptr& stream, int64_t total_num_rows, Compression::type codec, - ::arrow::MemoryPool* pool, struct PageReaderContext* ctx) + ::arrow::MemoryPool* pool, const CryptoContext* crypto_ctx) : stream_(stream), decompression_buffer_(AllocateBuffer(pool, 0)), - column_has_dictionary_(false), - first_page_(true), - row_group_ordinal_(-1), - column_ordinal_(-1), - page_ordinal_(-1), + page_ordinal_(0), seen_num_rows_(0), total_num_rows_(total_num_rows), - decryption_buffer_(AllocateBuffer(pool, 0)), - meta_decryptor_(NULLPTR), - data_decryptor_(NULLPTR) { - if (ctx != NULLPTR) { - column_has_dictionary_ = ctx->column_has_dictionary; - row_group_ordinal_ = ctx->row_group_ordinal; - column_ordinal_ = ctx->column_ordinal; - meta_decryptor_ = ctx->meta_decryptor; - data_decryptor_ = ctx->data_decryptor; - if (data_decryptor_ != NULLPTR || meta_decryptor_ != NULLPTR) { - InitDecryption(); - } + decryption_buffer_(AllocateBuffer(pool, 0)) { + if (crypto_ctx != NULLPTR) { + crypto_ctx_ = *crypto_ctx; + InitDecryption(); } max_page_header_size_ = kDefaultMaxPageHeaderSize; decompressor_ = GetCodec(codec); @@ 
-152,8 +140,7 @@ class SerializedPageReader : public PageReader { void set_max_page_header_size(uint32_t size) override { max_page_header_size_ = size; } private: - void UpdateDecryption(const std::shared_ptr& decryptor, - bool current_page_is_dictionary, int8_t module_type, + void UpdateDecryption(const std::shared_ptr& decryptor, int8_t module_type, const std::string& pageAAD); void InitDecryption(); @@ -169,25 +156,15 @@ class SerializedPageReader : public PageReader { // The fields below are used for calculation of AAD (additional authenticated data) // suffix which is part of the Parquet Modular Encryption. - // The AAD suffix for a parquet module is built internally by Parquet, by direct - // concatenation the different parts of the module, which includes amongst other - // its row group ordinal, column ordinal and page ordinal. + // The AAD suffix for a parquet module is built internally by + // concatenating different parts some of which include + // the row group ordinal, column ordinal and page ordinal. // Please refer to the encryption specification for more details: // https://github.com/apache/parquet-format/blob/encryption/Encryption.md#44-additional-authenticated-data - // To calculate the AAD suffix of an encrypted module, the exact type of the module - // should be known. The following two fields indicate whether the page is data or - // dictionary page. - - // Indicates whether the column has dictionary page. - bool column_has_dictionary_; - // If the column has dictionary page and the page currently processed in the first - // one then it is a dictionary page. - bool first_page_; - // The ordinal fields below are used for AAD suffix calculation. - int16_t row_group_ordinal_; - int16_t column_ordinal_; - int16_t page_ordinal_; + // The ordinal fields in the context below are used for AAD suffix calculation. 
+ CryptoContext crypto_ctx_; + int16_t page_ordinal_; // page ordinal does not count the dictionary page // Maximum allowed page size uint32_t max_page_header_size_; @@ -206,35 +183,34 @@ class SerializedPageReader : public PageReader { std::string data_page_headerAAD_; // Encryption std::shared_ptr decryption_buffer_; - std::shared_ptr meta_decryptor_; - std::shared_ptr data_decryptor_; }; void SerializedPageReader::InitDecryption() { // Prepare the AAD for quick update later. - if (data_decryptor_ != NULLPTR) { - DCHECK(!data_decryptor_->file_aad().empty()); - data_pageAAD_ = encryption::CreateModuleAad( - data_decryptor_->file_aad(), encryption::kDataPage, row_group_ordinal_, - column_ordinal_, static_cast(-1)); + if (crypto_ctx_.data_decryptor != NULLPTR) { + DCHECK(!crypto_ctx_.data_decryptor->file_aad().empty()); + data_pageAAD_ = + encryption::CreateModuleAad(crypto_ctx_.data_decryptor->file_aad(), + encryption::kDataPage, crypto_ctx_.row_group_ordinal, + crypto_ctx_.column_ordinal, static_cast(-1)); } - if (meta_decryptor_ != NULLPTR) { - DCHECK(!meta_decryptor_->file_aad().empty()); + if (crypto_ctx_.meta_decryptor != NULLPTR) { + DCHECK(!crypto_ctx_.meta_decryptor->file_aad().empty()); data_page_headerAAD_ = encryption::CreateModuleAad( - meta_decryptor_->file_aad(), encryption::kDataPageHeader, row_group_ordinal_, - column_ordinal_, static_cast(-1)); + crypto_ctx_.meta_decryptor->file_aad(), encryption::kDataPageHeader, + crypto_ctx_.row_group_ordinal, crypto_ctx_.column_ordinal, + static_cast(-1)); } } void SerializedPageReader::UpdateDecryption(const std::shared_ptr& decryptor, - bool current_page_is_dictionary, int8_t module_type, const std::string& pageAAD) { DCHECK(decryptor != NULLPTR); - if (current_page_is_dictionary) { - std::string aad = encryption::CreateModuleAad(decryptor->file_aad(), module_type, - row_group_ordinal_, column_ordinal_, - static_cast(-1)); + if (crypto_ctx_.start_decrypt_with_dictionary_page) { + std::string aad = 
encryption::CreateModuleAad( + decryptor->file_aad(), module_type, crypto_ctx_.row_group_ordinal, + crypto_ctx_.column_ordinal, static_cast(-1)); decryptor->UpdateAad(aad); } else { encryption::QuickUpdatePageAad(pageAAD, page_ordinal_); @@ -245,17 +221,6 @@ void SerializedPageReader::UpdateDecryption(const std::shared_ptr& de std::shared_ptr SerializedPageReader::NextPage() { // Loop here because there may be unhandled page types that we skip until // finding a page that we do know what to do with - bool current_page_is_dictionary = false; - if (column_has_dictionary_) { - if (first_page_) { - current_page_is_dictionary = true; - first_page_ = false; - } else { - page_ordinal_++; - } - } else { - page_ordinal_++; - } while (seen_num_rows_ < total_num_rows_) { uint32_t header_size = 0; @@ -268,18 +233,19 @@ std::shared_ptr SerializedPageReader::NextPage() { string_view buffer; PARQUET_THROW_NOT_OK(stream_->Peek(allowed_page_size, &buffer)); if (buffer.size() == 0) { - return std::shared_ptr(nullptr); + return std::shared_ptr(NULLPTR); } // This gets used, then set by DeserializeThriftMsg header_size = static_cast(buffer.size()); try { - if (meta_decryptor_ != NULLPTR) { - UpdateDecryption(meta_decryptor_, current_page_is_dictionary, - encryption::kDictionaryPageHeader, data_page_headerAAD_); + if (crypto_ctx_.meta_decryptor != NULLPTR) { + UpdateDecryption(crypto_ctx_.meta_decryptor, encryption::kDictionaryPageHeader, + data_page_headerAAD_); } DeserializeThriftMsg(reinterpret_cast(buffer.data()), - &header_size, ¤t_page_header_, meta_decryptor_); + &header_size, ¤t_page_header_, + crypto_ctx_.meta_decryptor); break; } catch (std::exception& e) { // Failed to deserialize. 
Double the allowed page header size and try again @@ -297,9 +263,9 @@ std::shared_ptr SerializedPageReader::NextPage() { int compressed_len = current_page_header_.compressed_page_size; int uncompressed_len = current_page_header_.uncompressed_page_size; - if (data_decryptor_ != NULLPTR) { - UpdateDecryption(data_decryptor_, current_page_is_dictionary, - encryption::kDictionaryPage, data_pageAAD_); + if (crypto_ctx_.data_decryptor != NULLPTR) { + UpdateDecryption(crypto_ctx_.data_decryptor, encryption::kDictionaryPage, + data_pageAAD_); } // Read the compressed data page. std::shared_ptr page_buffer; @@ -312,16 +278,16 @@ std::shared_ptr SerializedPageReader::NextPage() { } // Decrypt it if we need to - if (data_decryptor_ != nullptr) { + if (crypto_ctx_.data_decryptor != NULLPTR) { PARQUET_THROW_NOT_OK(decryption_buffer_->Resize( - compressed_len - data_decryptor_->CiphertextSizeDelta())); - compressed_len = data_decryptor_->Decrypt(page_buffer->data(), compressed_len, - decryption_buffer_->mutable_data()); + compressed_len - crypto_ctx_.data_decryptor->CiphertextSizeDelta())); + compressed_len = crypto_ctx_.data_decryptor->Decrypt( + page_buffer->data(), compressed_len, decryption_buffer_->mutable_data()); page_buffer = decryption_buffer_; } // Uncompress it if we need to - if (decompressor_ != nullptr) { + if (decompressor_ != NULLPTR) { // Grow the uncompressed buffer if we need to. 
if (uncompressed_len > static_cast(decompression_buffer_->size())) { PARQUET_THROW_NOT_OK(decompression_buffer_->Resize(uncompressed_len, false)); @@ -333,6 +299,7 @@ std::shared_ptr SerializedPageReader::NextPage() { } if (current_page_header_.type == format::PageType::DICTIONARY_PAGE) { + crypto_ctx_.start_decrypt_with_dictionary_page = false; const format::DictionaryPageHeader& dict_header = current_page_header_.dictionary_page_header; @@ -342,6 +309,7 @@ std::shared_ptr SerializedPageReader::NextPage() { FromThrift(dict_header.encoding), is_sorted); } else if (current_page_header_.type == format::PageType::DATA_PAGE) { + page_ordinal_++; const format::DataPageHeader& header = current_page_header_.data_page_header; EncodedStatistics page_statistics; @@ -368,6 +336,7 @@ std::shared_ptr SerializedPageReader::NextPage() { FromThrift(header.definition_level_encoding), FromThrift(header.repetition_level_encoding), page_statistics); } else if (current_page_header_.type == format::PageType::DATA_PAGE_V2) { + page_ordinal_++; const format::DataPageHeaderV2& header = current_page_header_.data_page_header_v2; bool is_compressed = header.__isset.is_compressed ? 
header.is_compressed : false; @@ -383,12 +352,12 @@ std::shared_ptr SerializedPageReader::NextPage() { continue; } } - return std::shared_ptr(nullptr); + return std::shared_ptr(NULLPTR); } std::unique_ptr PageReader::Open( const std::shared_ptr& stream, int64_t total_num_rows, - Compression::type codec, ::arrow::MemoryPool* pool, struct PageReaderContext* ctx) { + Compression::type codec, ::arrow::MemoryPool* pool, const CryptoContext* ctx) { return std::unique_ptr( new SerializedPageReader(stream, total_num_rows, codec, pool, ctx)); } @@ -414,7 +383,7 @@ class ColumnReaderImplBase { num_buffered_values_(0), num_decoded_values_(0), pool_(pool), - current_decoder_(nullptr), + current_decoder_(NULLPTR), current_encoding_(Encoding::UNKNOWN) {} virtual ~ColumnReaderImplBase() = default; @@ -598,7 +567,7 @@ class ColumnReaderImplBase { auto it = decoders_.find(static_cast(encoding)); if (it != decoders_.end()) { - DCHECK(it->second.get() != nullptr); + DCHECK(it->second.get() != NULLPTR); if (encoding == Encoding::RLE_DICTIONARY) { DCHECK(current_decoder_->encoding() == Encoding::RLE_DICTIONARY); } @@ -893,7 +862,7 @@ std::shared_ptr ColumnReader::Make(const ColumnDescriptor* descr, ParquetException::NYI("type reader not implemented"); } // Unreachable code, but supress compiler warning - return std::shared_ptr(nullptr); + return std::shared_ptr(NULLPTR); } // ---------------------------------------------------------------------- diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index aa2e3a3c069f..bf5270be8a41 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -73,11 +73,19 @@ class PARQUET_EXPORT LevelDecoder { std::unique_ptr<::arrow::BitUtil::BitReader> bit_packed_decoder_; }; -struct PageReaderContext { - bool column_has_dictionary; - int16_t row_group_ordinal; - int16_t column_ordinal; - +struct CryptoContext { + CryptoContext(bool start_with_dictionary_page, int16_t rg_ordinal, int16_t col_ordinal, + 
std::shared_ptr meta, std::shared_ptr data) + : start_decrypt_with_dictionary_page(start_with_dictionary_page), + row_group_ordinal(rg_ordinal), + column_ordinal(col_ordinal), + meta_decryptor(meta), + data_decryptor(data) {} + CryptoContext() {} + + bool start_decrypt_with_dictionary_page = false; + int16_t row_group_ordinal = -1; + int16_t column_ordinal = -1; std::shared_ptr meta_decryptor; std::shared_ptr data_decryptor; }; @@ -91,7 +99,7 @@ class PARQUET_EXPORT PageReader { static std::unique_ptr Open( const std::shared_ptr& stream, int64_t total_num_rows, Compression::type codec, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), - struct PageReaderContext* ctx = NULLPTR); + const CryptoContext* ctx = nullptr); // @returns: shared_ptr(nullptr) on EOS, std::shared_ptr // containing new Page otherwise diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 61813e5861ae..b637649e1884 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -128,11 +128,10 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::unique_ptr crypto_metadata = col->crypto_metadata(); - PageReaderContext ctx; // Column is encrypted only if crypto_metadata exists. 
if (!crypto_metadata) { - ctx = PageReaderContext{col->has_dictionary_page(), row_group_ordinal_, - static_cast(i), NULLPTR, NULLPTR}; + CryptoContext ctx(col->has_dictionary_page(), row_group_ordinal_, + static_cast(i), NULLPTR, NULLPTR); return PageReader::Open(stream, col->num_values(), col->compression(), properties_.memory_pool(), &ctx); } @@ -145,8 +144,8 @@ class SerializedRowGroup : public RowGroupReader::Contents { if (crypto_metadata->encrypted_with_footer_key()) { meta_decryptor = file_decryptor_->GetFooterDecryptorForColumnMeta(); data_decryptor = file_decryptor_->GetFooterDecryptorForColumnData(); - PageReaderContext ctx = {col->has_dictionary_page(), row_group_ordinal_, - static_cast(i), meta_decryptor, data_decryptor}; + CryptoContext ctx(col->has_dictionary_page(), row_group_ordinal_, + static_cast(i), meta_decryptor, data_decryptor); return PageReader::Open(stream, col->num_values(), col->compression(), properties_.memory_pool(), &ctx); } @@ -161,8 +160,8 @@ class SerializedRowGroup : public RowGroupReader::Contents { data_decryptor = file_decryptor_->GetColumnDataDecryptor(column_path, column_key_metadata); - ctx = PageReaderContext{col->has_dictionary_page(), row_group_ordinal_, - static_cast(i), meta_decryptor, data_decryptor}; + CryptoContext ctx(col->has_dictionary_page(), row_group_ordinal_, + static_cast(i), meta_decryptor, data_decryptor); return PageReader::Open(stream, col->num_values(), col->compression(), properties_.memory_pool(), &ctx); } @@ -344,7 +343,7 @@ void SerializedFile::ParseMetaDataOfEncryptedFileWithEncryptedFooter( } } auto file_decryption_properties = properties_.file_decryption_properties(); - if (file_decryption_properties == nullptr) { + if (file_decryption_properties == NULLPTR) { throw ParquetException( "No decryption properties are provided. 
Could not read " "encrypted footer metadata"); @@ -467,7 +466,7 @@ std::unique_ptr ParquetFileReader::Contents::Open( // Access private methods here, but otherwise unavailable SerializedFile* file = static_cast(result.get()); - if (metadata == nullptr) { + if (metadata == NULLPTR) { // Validates magic bytes, parses metadata, and initializes the SchemaDescriptor file->ParseMetaData(); } else { From 1a39821da009a7186bc32aae36384f7713531e8a Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 30 Sep 2019 13:47:53 +0200 Subject: [PATCH 153/201] ARROW-6610: [C++] Add cmake option to disable filesystem layer Closes #5515 from pitrou/ARROW-6610-cmake-filesystem-flag and squashes the following commits: b74741dde ARROW-6610: Add cmake option to disable filesystem layer Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/CMakeLists.txt | 6 +- cpp/cmake_modules/DefineOptions.cmake | 44 ++-- cpp/cmake_modules/ThirdpartyToolchain.cmake | 7 +- cpp/examples/minimal_build/build.sh | 6 +- cpp/src/arrow/CMakeLists.txt | 234 ++++++++++---------- 5 files changed, 158 insertions(+), 139 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2ab620d8f6d7..ea1a5756587c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -268,10 +268,14 @@ if(ARROW_BUILD_TESTS OR ARROW_BUILD_INTEGRATION) set(ARROW_JSON ON) endif() -if(ARROW_FLIGHT OR ARROW_PARQUET OR ARROW_BUILD_TESTS) +if(ARROW_CUDA OR ARROW_FLIGHT OR ARROW_PARQUET OR ARROW_BUILD_TESTS) set(ARROW_IPC ON) endif() +if(ARROW_DATASET) + set(ARROW_FILESYSTEM ON) +endif() + if(MSVC) # ORC doesn't build on windows set(ARROW_ORC OFF) diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index 3764f4824962..7a0158ef4c61 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ -90,21 +90,19 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") #---------------------------------------------------------------------- 
set_option_category("Test and benchmark") - define_option(ARROW_BUILD_EXAMPLES "Build the Arrow examples, default OFF" OFF) + define_option(ARROW_BUILD_EXAMPLES "Build the Arrow examples" OFF) - define_option(ARROW_BUILD_TESTS "Build the Arrow googletest unit tests, default OFF" - OFF) - - define_option(ARROW_BUILD_INTEGRATION - "Build the Arrow integration test executables, default OFF" OFF) + define_option(ARROW_BUILD_TESTS "Build the Arrow googletest unit tests" OFF) - define_option(ARROW_BUILD_BENCHMARKS "Build the Arrow micro benchmarks, default OFF" + define_option(ARROW_BUILD_INTEGRATION "Build the Arrow integration test executables" OFF) + define_option(ARROW_BUILD_BENCHMARKS "Build the Arrow micro benchmarks" OFF) + # Reference benchmarks are used to compare to naive implementation, or # discover various hardware limits. define_option(ARROW_BUILD_BENCHMARKS_REFERENCE - "Build the Arrow micro reference benchmarks, default OFF." OFF) + "Build the Arrow micro reference benchmarks" OFF) define_option_string(ARROW_TEST_LINKAGE "Linkage of Arrow libraries with unit tests executables." 
@@ -141,47 +139,49 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") #---------------------------------------------------------------------- set_option_category("Project component") + define_option(ARROW_BUILD_UTILITIES "Build Arrow commandline utilities" ON) + define_option(ARROW_COMPUTE "Build the Arrow Compute Modules" ON) + define_option(ARROW_CUDA "Build the Arrow CUDA extensions (requires CUDA toolkit)" OFF) + define_option(ARROW_DATASET "Build the Arrow Dataset Modules" ON) + define_option(ARROW_FILESYSTEM "Build the Arrow Filesystem Layer" ON) + define_option(ARROW_FLIGHT "Build the Arrow Flight RPC System (requires GRPC, Protocol Buffers)" OFF) define_option(ARROW_GANDIVA "Build the Gandiva libraries" OFF) - define_option(ARROW_PARQUET "Build the Parquet libraries" OFF) - - define_option(ARROW_IPC "Build the Arrow IPC extensions" ON) + define_option(ARROW_HDFS "Build the Arrow HDFS bridge" ON) - define_option(ARROW_BUILD_UTILITIES "Build Arrow commandline utilities" ON) + define_option(ARROW_HIVESERVER2 "Build the HiveServer2 client and Arrow adapter" OFF) - define_option(ARROW_CUDA "Build the Arrow CUDA extensions (requires CUDA toolkit)" OFF) + define_option(ARROW_IPC "Build the Arrow IPC extensions" ON) - define_option(ARROW_ORC "Build the Arrow ORC adapter" OFF) + define_option(ARROW_JEMALLOC "Build the Arrow jemalloc-based allocator" ON) define_option(ARROW_JNI "Build the Arrow JNI lib" OFF) - define_option(ARROW_TENSORFLOW "Build Arrow with TensorFlow support enabled" OFF) - - define_option(ARROW_JEMALLOC "Build the Arrow jemalloc-based allocator" ON) + define_option(ARROW_JSON "Build Arrow with JSON support (requires RapidJSON)" ON) define_option(ARROW_MIMALLOC "Build the Arrow mimalloc-based allocator" OFF) - define_option(ARROW_HDFS "Build the Arrow HDFS bridge" ON) - - define_option(ARROW_PYTHON "Build the Arrow CPython extensions" OFF) + define_option(ARROW_PARQUET "Build the Parquet libraries" OFF) - 
define_option(ARROW_HIVESERVER2 "Build the HiveServer2 client and Arrow adapter" OFF) + define_option(ARROW_ORC "Build the Arrow ORC adapter" OFF) define_option(ARROW_PLASMA "Build the plasma object store along with Arrow" OFF) define_option(ARROW_PLASMA_JAVA_CLIENT "Build the plasma object store java client" OFF) - define_option(ARROW_JSON "Build Arrow with JSON support (requires RapidJSON)" ON) + define_option(ARROW_PYTHON "Build the Arrow CPython extensions" OFF) define_option(ARROW_S3 "Build Arrow with S3 support (requires the AWS SDK for C++)" OFF) + define_option(ARROW_TENSORFLOW "Build Arrow with TensorFlow support enabled" OFF) + #---------------------------------------------------------------------- set_option_category("Thirdparty toolchain") diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index e8e4fa6f7bc7..4b707a2092a8 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -15,6 +15,9 @@ # specific language governing permissions and limitations # under the License. 
+include(ProcessorCount) +processorcount(NPROC) + add_custom_target(rapidjson) add_custom_target(toolchain) add_custom_target(toolchain-benchmarks) @@ -500,7 +503,7 @@ if(${CMAKE_GENERATOR} MATCHES "Makefiles") set(MAKE_BUILD_ARGS "") else() # limit the maximum number of jobs for ninja - set(MAKE_BUILD_ARGS "-j4") + set(MAKE_BUILD_ARGS "-j${NPROC}") endif() # ---------------------------------------------------------------------- @@ -560,7 +563,7 @@ macro(build_boost) else() set(BOOST_BUILD_VARIANT "release") endif() - set(BOOST_BUILD_COMMAND "./b2" "link=${BOOST_BUILD_LINK}" + set(BOOST_BUILD_COMMAND "./b2" "-j${NPROC}" "link=${BOOST_BUILD_LINK}" "variant=${BOOST_BUILD_VARIANT}") if(MSVC) string(REGEX diff --git a/cpp/examples/minimal_build/build.sh b/cpp/examples/minimal_build/build.sh index 068f4898ba0d..a8c1552eebe9 100755 --- a/cpp/examples/minimal_build/build.sh +++ b/cpp/examples/minimal_build/build.sh @@ -22,6 +22,7 @@ # docker run --rm -t -i -v $PWD:/io -v $ARROW_ROOT:/arrow arrow_cpp_minimal /io/build.sh BUILD_DIR=/build +NPROC=$(nproc) mkdir $BUILD_DIR pushd $BUILD_DIR @@ -30,6 +31,8 @@ cmake /arrow/cpp -DBOOST_SOURCE=BUNDLED \ -DARROW_BOOST_USE_SHARED=OFF \ -DARROW_COMPUTE=OFF \ -DARROW_DATASET=OFF \ + -DARROW_FILESYSTEM=OFF \ + -DARROW_HDFS=OFF \ -DARROW_JEMALLOC=OFF \ -DARROW_JSON=OFF \ -DARROW_USE_GLOG=OFF \ @@ -41,6 +44,7 @@ cmake /arrow/cpp -DBOOST_SOURCE=BUNDLED \ -DARROW_WITH_BROTLI=OFF \ -DARROW_BUILD_UTILITIES=OFF -make -j8 +make -j$NPROC +make install popd diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 91ff44369982..bf5a77e58857 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -114,12 +114,6 @@ set(ARROW_SRCS csv/options.cc csv/parser.cc csv/reader.cc - filesystem/filesystem.cc - filesystem/localfs.cc - filesystem/mockfs.cc - filesystem/path_tree.cc - filesystem/path_util.cc - filesystem/util_internal.cc io/buffered.cc io/compressed.cc io/file.cc @@ -149,6 +143,13 @@ set(ARROW_SRCS 
vendored/base64.cpp vendored/datetime/tz.cpp) +set(ARROW_TESTING_SRCS + io/test_common.cc + ipc/test_common.cc + testing/gtest_util.cc + testing/random.cc + testing/generator.cc) + # Add dependencies for third-party allocators. # If possible we only want memory_pool.cc to wait for allocators to finish building, # but that only works with Ninja @@ -173,26 +174,6 @@ endif() unset(_allocator_dependencies) -if(ARROW_JSON) - add_subdirectory(json) - set(ARROW_SRCS - ${ARROW_SRCS} - json/options.cc - json/chunked_builder.cc - json/chunker.cc - json/converter.cc - json/parser.cc - json/reader.cc) -endif() - -if(ARROW_S3) - set(ARROW_SRCS ${ARROW_SRCS} filesystem/s3fs.cc) -endif() - -if(ARROW_WITH_URIPARSER) - set(ARROW_SRCS ${ARROW_SRCS} util/uri.cc) -endif() - if("${COMPILER_FAMILY}" STREQUAL "clang") set_property(SOURCE util/io_util.cc APPEND_STRING @@ -202,94 +183,149 @@ endif() # Unconditionally install testing headers that are also useful for Arrow consumers. add_subdirectory(testing) +# +# Add the various components +# + +add_subdirectory(array) +add_subdirectory(csv) +add_subdirectory(io) +add_subdirectory(util) +add_subdirectory(vendored) + if(ARROW_COMPUTE) add_subdirectory(compute) - set(ARROW_SRCS - ${ARROW_SRCS} - compute/context.cc - compute/expression.cc - compute/logical_type.cc - compute/operation.cc - compute/kernels/aggregate.cc - compute/kernels/boolean.cc - compute/kernels/cast.cc - compute/kernels/compare.cc - compute/kernels/count.cc - compute/kernels/hash.cc - compute/kernels/filter.cc - compute/kernels/mean.cc - compute/kernels/sort_to_indices.cc - compute/kernels/sum.cc - compute/kernels/take.cc - compute/kernels/isin.cc - compute/kernels/util_internal.cc - compute/operations/cast.cc - compute/operations/literal.cc) + list(APPEND ARROW_SRCS + compute/context.cc + compute/expression.cc + compute/logical_type.cc + compute/operation.cc + compute/kernels/aggregate.cc + compute/kernels/boolean.cc + compute/kernels/cast.cc + compute/kernels/compare.cc + 
compute/kernels/count.cc + compute/kernels/hash.cc + compute/kernels/filter.cc + compute/kernels/mean.cc + compute/kernels/sort_to_indices.cc + compute/kernels/sum.cc + compute/kernels/take.cc + compute/kernels/isin.cc + compute/kernels/util_internal.cc + compute/operations/cast.cc + compute/operations/literal.cc) endif() if(ARROW_CUDA) - # IPC extensions required to build the CUDA library - set(ARROW_IPC ON) add_subdirectory(gpu) endif() -if(ARROW_WITH_BROTLI) - add_definitions(-DARROW_WITH_BROTLI) - set(ARROW_SRCS util/compression_brotli.cc ${ARROW_SRCS}) +if(ARROW_DATASET) + add_subdirectory(dataset) endif() -if(ARROW_WITH_BZ2) - add_definitions(-DARROW_WITH_BZ2) - set(ARROW_SRCS util/compression_bz2.cc ${ARROW_SRCS}) +if(ARROW_FILESYSTEM) + add_subdirectory(filesystem) + + list(APPEND ARROW_SRCS + filesystem/filesystem.cc + filesystem/localfs.cc + filesystem/mockfs.cc + filesystem/path_tree.cc + filesystem/path_util.cc + filesystem/util_internal.cc) + + if(ARROW_S3) + list(APPEND ARROW_SRCS filesystem/s3fs.cc) + endif() + + list(APPEND ARROW_TESTING_SRCS filesystem/test_util.cc) endif() -if(ARROW_WITH_LZ4) - add_definitions(-DARROW_WITH_LZ4) - set(ARROW_SRCS util/compression_lz4.cc ${ARROW_SRCS}) +if(ARROW_FLIGHT) + add_subdirectory(flight) endif() -if(ARROW_WITH_SNAPPY) - add_definitions(-DARROW_WITH_SNAPPY) - set(ARROW_SRCS util/compression_snappy.cc ${ARROW_SRCS}) +if(ARROW_HIVESERVER2) + add_subdirectory(dbi/hiveserver2) endif() -if(ARROW_WITH_ZLIB) - add_definitions(-DARROW_WITH_ZLIB) - set(ARROW_SRCS util/compression_zlib.cc ${ARROW_SRCS}) +if(ARROW_IPC) + add_subdirectory(ipc) + + list(APPEND ARROW_SRCS + ipc/dictionary.cc + ipc/feather.cc + ipc/message.cc + ipc/metadata_internal.cc + ipc/options.cc + ipc/reader.cc + ipc/writer.cc) + + if(ARROW_JSON) + list( + APPEND ARROW_SRCS ipc/json_integration.cc ipc/json_internal.cc ipc/json_simple.cc) + endif() + add_dependencies(arrow_dependencies metadata_fbs) endif() -if(ARROW_WITH_ZSTD) - 
add_definitions(-DARROW_WITH_ZSTD) - set(ARROW_SRCS util/compression_zstd.cc ${ARROW_SRCS}) +if(ARROW_JSON) + add_subdirectory(json) + list(APPEND ARROW_SRCS + json/options.cc + json/chunked_builder.cc + json/chunker.cc + json/converter.cc + json/parser.cc + json/reader.cc) endif() if(ARROW_ORC) add_subdirectory(adapters/orc) - set(ARROW_SRCS adapters/orc/adapter.cc adapters/orc/adapter_util.cc ${ARROW_SRCS}) + list(APPEND ARROW_SRCS adapters/orc/adapter.cc adapters/orc/adapter_util.cc) +endif() + +if(ARROW_PYTHON) + add_subdirectory(python) endif() if(ARROW_TENSORFLOW) add_subdirectory(adapters/tensorflow) endif() -if(ARROW_IPC) - add_subdirectory(ipc) +if(ARROW_WITH_URIPARSER) + list(APPEND ARROW_SRCS util/uri.cc) +endif() - set(ARROW_SRCS - ${ARROW_SRCS} - ipc/dictionary.cc - ipc/feather.cc - ipc/message.cc - ipc/metadata_internal.cc - ipc/options.cc - ipc/reader.cc - ipc/writer.cc) +if(ARROW_WITH_BROTLI) + add_definitions(-DARROW_WITH_BROTLI) + list(APPEND ARROW_SRCS util/compression_brotli.cc) +endif() - if(ARROW_JSON) - set(ARROW_SRCS ${ARROW_SRCS} ipc/json_integration.cc ipc/json_internal.cc - ipc/json_simple.cc) - endif() - add_dependencies(arrow_dependencies metadata_fbs) +if(ARROW_WITH_BZ2) + add_definitions(-DARROW_WITH_BZ2) + list(APPEND ARROW_SRCS util/compression_bz2.cc) +endif() + +if(ARROW_WITH_LZ4) + add_definitions(-DARROW_WITH_LZ4) + list(APPEND ARROW_SRCS util/compression_lz4.cc) +endif() + +if(ARROW_WITH_SNAPPY) + add_definitions(-DARROW_WITH_SNAPPY) + list(APPEND ARROW_SRCS util/compression_snappy.cc) +endif() + +if(ARROW_WITH_ZLIB) + add_definitions(-DARROW_WITH_ZLIB) + list(APPEND ARROW_SRCS util/compression_zlib.cc) +endif() + +if(ARROW_WITH_ZSTD) + add_definitions(-DARROW_WITH_ZSTD) + list(APPEND ARROW_SRCS util/compression_zstd.cc) endif() if(NOT APPLE AND NOT MSVC) @@ -333,12 +369,7 @@ if(ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS OR ARROW_BUILD_INTEGRATION) # that depend on gtest add_arrow_lib(arrow_testing SOURCES - io/test_common.cc 
- ipc/test_common.cc - filesystem/test_util.cc - testing/gtest_util.cc - testing/random.cc - testing/generator.cc + ${ARROW_TESTING_SRCS} OUTPUTS ARROW_TESTING_LIBRARIES DEPENDENCIES @@ -412,26 +443,3 @@ add_arrow_test(sparse_tensor_test) add_arrow_benchmark(builder_benchmark) add_arrow_benchmark(type_benchmark) - -add_subdirectory(array) -add_subdirectory(csv) -add_subdirectory(filesystem) -add_subdirectory(io) -add_subdirectory(util) -add_subdirectory(vendored) - -if(ARROW_DATASET) - add_subdirectory(dataset) -endif() - -if(ARROW_FLIGHT) - add_subdirectory(flight) -endif() - -if(ARROW_PYTHON) - add_subdirectory(python) -endif() - -if(ARROW_HIVESERVER2) - add_subdirectory(dbi/hiveserver2) -endif() From 58e1144434802f7fa358f078437be4de120d9c36 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 30 Sep 2019 12:13:11 -0500 Subject: [PATCH 154/201] ARROW-6564: [Python] Do not require pandas for invoking ChunkedArray.__array__ Follow-up on https://github.com/apache/arrow/pull/5402, to apply the same fix for `ChunkedArray` (@wesm was a bit quick with merging :)) Closes #5497 from jorisvandenbossche/ARROW-6564-chunked-array-to-pandas and squashes the following commits: e6f5adc50 ARROW-6564: Do not require pandas for invoking ChunkedArray.__array__ Authored-by: Joris Van den Bossche Signed-off-by: Wes McKinney --- python/pyarrow/table.pxi | 19 ++++++++++++++++++- python/pyarrow/tests/test_table.py | 11 +++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 086c4f45011e..a2f9194231b5 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -173,7 +173,24 @@ cdef class ChunkedArray(_PandasConvertible): return result def __array__(self, dtype=None): - values = self.to_pandas().values + cdef: + PyObject* out + PandasOptions c_options + object values + + with nogil: + check_status(libarrow.ConvertChunkedArrayToPandas( + c_options, + self.sp_chunked_array, + self, &out)) + + 
# wrap_array_output uses pandas to convert to Categorical, here + # always convert to numpy array + values = PyObject_to_object(out) + + if isinstance(values, dict): + values = np.take(values['dictionary'], values['indices']) + if dtype is None: return values return values.astype(dtype) diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index 434ec8029c1e..da1f084cf20f 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -203,7 +203,10 @@ def test_chunked_array_to_pandas(): @pytest.mark.pandas +@pytest.mark.nopandas def test_chunked_array_asarray(): + # ensure this is tested both when pandas is present or not (ARROW-6564) + data = [ pa.array([0]), pa.array([1, 2, 3]) @@ -232,6 +235,14 @@ def test_chunked_array_asarray(): assert elements[2:] == [1., 2., 3.] assert np_arr.dtype == np.dtype('float64') + # DictionaryType data will be converted to dense numpy array + arr = pa.DictionaryArray.from_arrays( + pa.array([0, 1, 2, 0, 1]), pa.array(['a', 'b', 'c'])) + chunked_arr = pa.chunked_array([arr, arr]) + np_arr = np.asarray(chunked_arr) + assert np_arr.dtype == np.dtype('object') + assert np_arr.tolist() == ['a', 'b', 'c', 'a', 'b'] * 2 + def test_chunked_array_flatten(): ty = pa.struct([pa.field('x', pa.int16()), From 4aaa2114db2aa6c04416a65204345fdca0580224 Mon Sep 17 00:00:00 2001 From: Pasha Stetsenko Date: Mon, 30 Sep 2019 16:28:19 -0500 Subject: [PATCH 155/201] ARROW-6729: [C++] Prevent data copying in StlStringBuffer Closes #5517 from st-pasha/stl-buffer-nocopy and squashes the following commits: 6f26b4dea try to satisfy lint 2d4dda6c4 remove whitespace 85e91f5e4 Prevent data copying in StlStringBuffer Authored-by: Pasha Stetsenko Signed-off-by: Wes McKinney --- cpp/src/arrow/buffer.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/buffer.cc b/cpp/src/arrow/buffer.cc index bfbf2f8f2ccb..ca78922f2c27 100644 --- a/cpp/src/arrow/buffer.cc +++ 
b/cpp/src/arrow/buffer.cc @@ -79,7 +79,8 @@ Status Buffer::FromString(const std::string& data, std::shared_ptr* out) class StlStringBuffer : public Buffer { public: - explicit StlStringBuffer(std::string&& data) : Buffer(nullptr, 0), input_(data) { + explicit StlStringBuffer(std::string&& data) + : Buffer(nullptr, 0), input_(std::move(data)) { data_ = reinterpret_cast(input_.c_str()); size_ = static_cast(input_.size()); capacity_ = size_; From c95aaab6cab90656949e4443a290f071a1765936 Mon Sep 17 00:00:00 2001 From: Sebastien Binet Date: Mon, 30 Sep 2019 17:42:05 -0500 Subject: [PATCH 156/201] ARROW-6646: [Go] Write no IPC buffer metadata for NullType Closes #5547 from sbinet/issue-6646 and squashes the following commits: 5fdd6496d ARROW-6646: Write no IPC buffer metadata for NullType Authored-by: Sebastien Binet Signed-off-by: Wes McKinney --- go/arrow/ipc/file_reader.go | 6 ++---- go/arrow/ipc/writer.go | 13 +++++++++---- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/go/arrow/ipc/file_reader.go b/go/arrow/ipc/file_reader.go index 813f5838b724..2b08b01240cb 100644 --- a/go/arrow/ipc/file_reader.go +++ b/go/arrow/ipc/file_reader.go @@ -437,10 +437,8 @@ func (ctx *arrayLoaderContext) loadChild(dt arrow.DataType) array.Interface { } func (ctx *arrayLoaderContext) loadNull() array.Interface { - field, buffers := ctx.loadCommon(1) - buffers = append(buffers, ctx.buffer()) - - data := array.NewData(arrow.Null, int(field.Length()), buffers, nil, int(field.NullCount()), 0) + field := ctx.field() + data := array.NewData(arrow.Null, int(field.Length()), nil, nil, int(field.NullCount()), 0) defer data.Release() return array.MakeFromData(data) diff --git a/go/arrow/ipc/writer.go b/go/arrow/ipc/writer.go index 356159867124..4ed0ff0b1ee1 100644 --- a/go/arrow/ipc/writer.go +++ b/go/arrow/ipc/writer.go @@ -221,14 +221,19 @@ func (w *recordEncoder) visit(p *payload, arr array.Interface) error { case 0: p.body = append(p.body, nil) default: - data := arr.Data() - 
bitmap := newTruncatedBitmap(w.mem, int64(data.Offset()), int64(data.Len()), data.Buffers()[0]) - p.body = append(p.body, bitmap) + switch arr.DataType().ID() { + case arrow.NULL: + // Null type has no validity bitmap + default: + data := arr.Data() + bitmap := newTruncatedBitmap(w.mem, int64(data.Offset()), int64(data.Len()), data.Buffers()[0]) + p.body = append(p.body, bitmap) + } } switch dtype := arr.DataType().(type) { case *arrow.NullType: - p.body = append(p.body, nil) + // ok. NullArrays are completely empty. case *arrow.BooleanType: var ( From 610deb7dc76cb4a5dfc3cd28e02963be97209109 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 30 Sep 2019 17:45:13 -0500 Subject: [PATCH 157/201] ARROW-6685: [C++] Ignore trailing slashes in S3FS Closes #5546 from pitrou/ARROW-6685-s3fs-trailing-slash and squashes the following commits: ae315aff8 ARROW-6685: Ignore trailing slashes in S3FS Authored-by: Antoine Pitrou Signed-off-by: Wes McKinney --- cpp/src/arrow/filesystem/filesystem_test.cc | 12 ++++++++++++ cpp/src/arrow/filesystem/path_util.cc | 2 +- cpp/src/arrow/filesystem/s3fs.cc | 11 ++++++----- cpp/src/arrow/filesystem/s3fs_test.cc | 21 +++++++++++++++++++++ 4 files changed, 40 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/filesystem/filesystem_test.cc b/cpp/src/arrow/filesystem/filesystem_test.cc index c8eba30218de..c199a2dde2cb 100644 --- a/cpp/src/arrow/filesystem/filesystem_test.cc +++ b/cpp/src/arrow/filesystem/filesystem_test.cc @@ -155,6 +155,18 @@ TEST(PathUtil, EnsureTrailingSlash) { ASSERT_EQ("/abc/", EnsureTrailingSlash("/abc/")); } +TEST(PathUtil, RemoveTrailingSlash) { + ASSERT_EQ("", std::string(RemoveTrailingSlash(""))); + ASSERT_EQ("", std::string(RemoveTrailingSlash("/"))); + ASSERT_EQ("", std::string(RemoveTrailingSlash("//"))); + ASSERT_EQ("abc/def", std::string(RemoveTrailingSlash("abc/def"))); + ASSERT_EQ("abc/def", std::string(RemoveTrailingSlash("abc/def/"))); + ASSERT_EQ("abc/def", 
std::string(RemoveTrailingSlash("abc/def//"))); + ASSERT_EQ("/abc/def", std::string(RemoveTrailingSlash("/abc/def"))); + ASSERT_EQ("/abc/def", std::string(RemoveTrailingSlash("/abc/def/"))); + ASSERT_EQ("/abc/def", std::string(RemoveTrailingSlash("/abc/def//"))); +} + //////////////////////////////////////////////////////////////////////////// // Generic MockFileSystem tests diff --git a/cpp/src/arrow/filesystem/path_util.cc b/cpp/src/arrow/filesystem/path_util.cc index cfc59a6f8d44..2ba143e59b94 100644 --- a/cpp/src/arrow/filesystem/path_util.cc +++ b/cpp/src/arrow/filesystem/path_util.cc @@ -115,7 +115,7 @@ std::string EnsureTrailingSlash(const std::string& s) { } util::string_view RemoveTrailingSlash(util::string_view key) { - if (!key.empty() && key.back() == kSep) { + while (!key.empty() && key.back() == kSep) { key.remove_suffix(1); } return key; diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index 136d12e20302..8053c223ade6 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -177,17 +177,18 @@ struct S3Path { std::vector key_parts; static Status FromString(const std::string& s, S3Path* out) { - auto first_sep = s.find_first_of(kSep); + const auto src = internal::RemoveTrailingSlash(s); + auto first_sep = src.find_first_of(kSep); if (first_sep == 0) { return Status::Invalid("Path cannot start with a separator ('", s, "')"); } if (first_sep == std::string::npos) { - *out = {s, s, "", {}}; + *out = {std::string(src), std::string(src), "", {}}; return Status::OK(); } - out->full_path = s; - out->bucket = s.substr(0, first_sep); - out->key = s.substr(first_sep + 1); + out->full_path = std::string(src); + out->bucket = std::string(src.substr(0, first_sep)); + out->key = std::string(src.substr(first_sep + 1)); out->key_parts = internal::SplitAbstractPath(out->key); return internal::ValidateAbstractPathParts(out->key_parts); } diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc 
b/cpp/src/arrow/filesystem/s3fs_test.cc index b19b18e01638..8564901ae400 100644 --- a/cpp/src/arrow/filesystem/s3fs_test.cc +++ b/cpp/src/arrow/filesystem/s3fs_test.cc @@ -375,6 +375,10 @@ TEST_F(TestS3FS, GetTargetStatsBucket) { AssertFileStats(fs_.get(), "bucket", FileType::Directory); AssertFileStats(fs_.get(), "empty-bucket", FileType::Directory); AssertFileStats(fs_.get(), "non-existent-bucket", FileType::NonExistent); + // Trailing slashes + AssertFileStats(fs_.get(), "bucket/", FileType::Directory); + AssertFileStats(fs_.get(), "empty-bucket/", FileType::Directory); + AssertFileStats(fs_.get(), "non-existent-bucket/", FileType::NonExistent); } TEST_F(TestS3FS, GetTargetStatsObject) { @@ -393,6 +397,12 @@ TEST_F(TestS3FS, GetTargetStatsObject) { AssertFileStats(fs_.get(), "bucket/emptyd", FileType::NonExistent); AssertFileStats(fs_.get(), "bucket/somed", FileType::NonExistent); AssertFileStats(fs_.get(), "non-existent-bucket/somed", FileType::NonExistent); + + // Trailing slashes + AssertFileStats(fs_.get(), "bucket/emptydir/", FileType::Directory, kNoSize); + AssertFileStats(fs_.get(), "bucket/somefile/", FileType::File, 9); + AssertFileStats(fs_.get(), "bucket/emptyd/", FileType::NonExistent); + AssertFileStats(fs_.get(), "non-existent-bucket/somed/", FileType::NonExistent); } TEST_F(TestS3FS, GetTargetStatsSelector) { @@ -447,6 +457,17 @@ TEST_F(TestS3FS, GetTargetStatsSelector) { ASSERT_OK(fs_->GetTargetStats(select, &stats)); ASSERT_EQ(stats.size(), 0); select.allow_non_existent = false; + + // Trailing slashes + select.base_dir = "empty-bucket/"; + ASSERT_OK(fs_->GetTargetStats(select, &stats)); + ASSERT_EQ(stats.size(), 0); + select.base_dir = "non-existent-bucket/"; + ASSERT_RAISES(IOError, fs_->GetTargetStats(select, &stats)); + select.base_dir = "bucket/"; + ASSERT_OK(fs_->GetTargetStats(select, &stats)); + SortStats(&stats); + ASSERT_EQ(stats.size(), 3); } TEST_F(TestS3FS, GetTargetStatsSelectorRecursive) { From 
df9fc54f1932cfaae156719757087af09a2186a1 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 30 Sep 2019 17:46:00 -0500 Subject: [PATCH 158/201] ARROW-6740: [C++] Unmap MemoryMappedFile as soon as possible When a MemoryMappedFile has been closed and all exported buffers have been destroyed, unmap the memory region instead of waiting for the file object destruction. This allows deleting the underlying file on Windows. Closes #5551 from pitrou/ARROW-6740-munmap-timely and squashes the following commits: 9afab352e ARROW-6740: Unmap MemoryMappedFile as soon as possible Authored-by: Antoine Pitrou Signed-off-by: Wes McKinney --- cpp/src/arrow/io/file.cc | 133 ++++++++++++++++++++------------ cpp/src/arrow/io/file_test.cc | 2 +- python/pyarrow/tests/test_io.py | 15 ++++ 3 files changed, 98 insertions(+), 52 deletions(-) diff --git a/cpp/src/arrow/io/file.cc b/cpp/src/arrow/io/file.cc index 2f3f5bac2e70..3a9306300d70 100644 --- a/cpp/src/arrow/io/file.cc +++ b/cpp/src/arrow/io/file.cc @@ -384,24 +384,47 @@ Status FileOutputStream::Write(const void* data, int64_t length) { int FileOutputStream::file_descriptor() const { return impl_->fd(); } // ---------------------------------------------------------------------- -// Implement MemoryMappedFile as a buffer subclass -// The class doesn't differentiate between size and capacity -class MemoryMappedFile::MemoryMap : public MutableBuffer { +// Implement MemoryMappedFile + +class MemoryMappedFile::MemoryMap + : public std::enable_shared_from_this { public: - MemoryMap() : MutableBuffer(nullptr, 0) {} + // An object representing the entire memory-mapped region. + // It can be sliced in order to return individual subregions, which + // will then keep the original region alive as long as necessary. 
+ class Region : public MutableBuffer { + public: + Region(std::shared_ptr memory_map, uint8_t* data, + int64_t size) + : MutableBuffer(data, size) { + is_mutable_ = memory_map->writable(); + if (!is_mutable_) { + mutable_data_ = nullptr; + } + } - ~MemoryMap() { - ARROW_CHECK_OK(Close()); - if (mutable_data_ != nullptr) { - int result = munmap(mutable_data_, static_cast(map_len_)); - ARROW_CHECK_EQ(result, 0) << "munmap failed"; + ~Region() { + if (data_ != nullptr) { + int result = munmap(data(), static_cast(size_)); + ARROW_CHECK_EQ(result, 0) << "munmap failed"; + } } - } + + // For convenience + uint8_t* data() { return const_cast(data_); } + + void Detach() { data_ = nullptr; } + }; + + MemoryMap() : file_size_(0), map_len_(0) {} + + ~MemoryMap() { ARROW_CHECK_OK(Close()); } Status Close() { if (file_->is_open()) { - // NOTE: we don't unmap here, so that buffers exported through Read() - // remain valid until the MemoryMap object is destroyed + // Lose our reference to the MemoryMappedRegion, so that munmap() + // is called as soon as all buffer exports are released. 
+ region_.reset(); return file_->Close(); } else { return Status::OK(); @@ -422,14 +445,10 @@ class MemoryMappedFile::MemoryMap : public MutableBuffer { constexpr bool truncate = false; constexpr bool write_only = false; RETURN_NOT_OK(file_->OpenWritable(path, truncate, append, write_only)); - - is_mutable_ = true; } else { prot_flags_ = PROT_READ; map_mode_ = MAP_PRIVATE; // Changes are not to be committed back to the file RETURN_NOT_OK(file_->OpenReadable(path)); - - is_mutable_ = false; } map_len_ = offset_ = 0; @@ -450,33 +469,38 @@ class MemoryMappedFile::MemoryMap : public MutableBuffer { if (!writable()) { return Status::IOError("Cannot resize a readonly memory map"); } - if (map_len_ != size_) { + if (map_len_ != file_size_) { return Status::IOError("Cannot resize a partial memory map"); } + if (region_.use_count() > 1) { + // There are buffer exports currently, the MemoryMapRemap() call + // would make the buffers invalid + return Status::IOError("Cannot resize memory map while there are active readers"); + } if (new_size == 0) { - if (mutable_data_ != nullptr) { - // just unmap the mmap and truncate the file to 0 size - if (munmap(mutable_data_, capacity_) != 0) { - return Status::IOError("Cannot unmap the file"); - } + if (map_len_ > 0) { + // Just unmap the mmap and truncate the file to 0 size + region_.reset(); RETURN_NOT_OK(::arrow::internal::FileTruncate(file_->fd(), 0)); - data_ = mutable_data_ = nullptr; - map_len_ = offset_ = size_ = capacity_ = 0; + map_len_ = offset_ = file_size_ = 0; } position_ = 0; return Status::OK(); } - if (mutable_data_) { + if (map_len_ > 0) { void* result; - RETURN_NOT_OK(::arrow::internal::MemoryMapRemap(mutable_data_, size_, new_size, + auto data = region_->data(); + RETURN_NOT_OK(::arrow::internal::MemoryMapRemap(data, map_len_, new_size, file_->fd(), &result)); - map_len_ = size_ = capacity_ = new_size; + region_->Detach(); // avoid munmap() on destruction + region_ = std::make_shared(shared_from_this(), + 
static_cast(result), new_size); + map_len_ = file_size_ = new_size; offset_ = 0; - data_ = mutable_data_ = static_cast(result); - if (position_ > size_) { - position_ = size_; + if (position_ > map_len_) { + position_ = map_len_; } } else { DCHECK_EQ(position_, 0); @@ -487,9 +511,6 @@ class MemoryMappedFile::MemoryMap : public MutableBuffer { return Status::OK(); } - // map_len_ == size_ if memory mapping on the whole file - int64_t size() const { return map_len_; } - Status Seek(int64_t position) { if (position < 0) { return Status::Invalid("position is out of bounds"); @@ -498,11 +519,28 @@ class MemoryMappedFile::MemoryMap : public MutableBuffer { return Status::OK(); } + Status Slice(int64_t offset, int64_t length, std::shared_ptr* out) { + length = std::max(0, std::min(length, map_len_ - offset)); + + if (length > 0) { + DCHECK_NE(region_, nullptr); + *out = SliceBuffer(region_, offset, length); + } else { + *out = std::make_shared(nullptr, 0); + } + return Status::OK(); + } + + // map_len_ == file_size_ if memory mapping on the whole file + int64_t size() const { return map_len_; } + int64_t position() { return position_; } void advance(int64_t nbytes) { position_ = position_ + nbytes; } - uint8_t* head() { return mutable_data_ + position_; } + uint8_t* head() { return data() + position_; } + + uint8_t* data() { return region_ ? 
region_->data() : nullptr; } bool writable() { return file_->mode() != FileMode::READ; } @@ -518,10 +556,11 @@ class MemoryMappedFile::MemoryMap : public MutableBuffer { // Initialize the mmap and set size, capacity and the data pointers Status InitMMap(int64_t initial_size, bool resize_file = false, const int64_t offset = 0, const int64_t length = -1) { + DCHECK(!region_); + if (resize_file) { RETURN_NOT_OK(::arrow::internal::FileTruncate(file_->fd(), initial_size)); } - DCHECK(data_ == nullptr && mutable_data_ == nullptr); size_t mmap_length = static_cast(initial_size); if (length > initial_size) { @@ -539,14 +578,19 @@ class MemoryMappedFile::MemoryMap : public MutableBuffer { } map_len_ = mmap_length; offset_ = offset; - size_ = capacity_ = initial_size; - data_ = mutable_data_ = static_cast(result); + region_ = std::make_shared(shared_from_this(), static_cast(result), + map_len_); + file_size_ = initial_size; return Status::OK(); } + std::unique_ptr file_; int prot_flags_; int map_mode_; + + std::shared_ptr region_; + int64_t file_size_; int64_t position_; int64_t offset_; int64_t map_len_; @@ -619,14 +663,7 @@ Status MemoryMappedFile::ReadAt(int64_t position, int64_t nbytes, auto guard_resize = memory_map_->writable() ? 
std::unique_lock(memory_map_->resize_lock()) : std::unique_lock(); - nbytes = std::max(0, std::min(nbytes, memory_map_->size() - position)); - - if (nbytes > 0) { - *out = SliceBuffer(memory_map_, position, nbytes); - } else { - *out = std::make_shared(nullptr, 0); - } - return Status::OK(); + return memory_map_->Slice(position, nbytes, out); } Status MemoryMappedFile::ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, @@ -697,12 +734,6 @@ Status MemoryMappedFile::Resize(int64_t new_size) { std::unique_lock write_guard(memory_map_->write_lock(), std::defer_lock); std::unique_lock resize_guard(memory_map_->resize_lock(), std::defer_lock); std::lock(write_guard, resize_guard); - // having both locks, we can check the number of times memory_map_ - // was borrwed (meaning number of reader still holding a ref to it + 1) - // and if it's greater than 1, we fail loudly - if (memory_map_.use_count() > 1) { - return Status::IOError("Cannot resize memory map while there are active readers"); - } RETURN_NOT_OK(memory_map_->Resize(new_size)); return Status::OK(); } diff --git a/cpp/src/arrow/io/file_test.cc b/cpp/src/arrow/io/file_test.cc index eed75add147c..ab12148b3837 100644 --- a/cpp/src/arrow/io/file_test.cc +++ b/cpp/src/arrow/io/file_test.cc @@ -636,7 +636,7 @@ class TestMemoryMappedFile : public ::testing::Test, public MemoryMapFixture { TEST_F(TestMemoryMappedFile, InvalidUsages) {} -TEST_F(TestMemoryMappedFile, ZeroSizeFlie) { +TEST_F(TestMemoryMappedFile, ZeroSizeFile) { std::string path = "io-memory-map-zero-size"; std::shared_ptr result; ASSERT_OK(InitMemoryMap(0, path, &result)); diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py index 76148e7d8b2d..55b183b7227e 100644 --- a/python/pyarrow/tests/test_io.py +++ b/python/pyarrow/tests/test_io.py @@ -839,6 +839,21 @@ def test_memory_map_large_seeks(): check_large_seeks(pa.memory_map) +def test_memory_map_close_remove(tmpdir): + # ARROW-6740: should be able to delete closed 
memory-mapped file (Windows) + path = os.path.join(str(tmpdir), guid()) + mmap = pa.create_memory_map(path, 4096) + mmap.close() + assert mmap.closed + os.remove(path) # Shouldn't fail + + +def test_memory_map_deref_remove(tmpdir): + path = os.path.join(str(tmpdir), guid()) + pa.create_memory_map(path, 4096) + os.remove(path) # Shouldn't fail + + def test_os_file_writer(tmpdir): SIZE = 4096 arr = np.random.randint(0, 256, size=SIZE).astype('u1') From 9e0e1a2f91f8711a61710eb18d38e5d251917742 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 30 Sep 2019 17:47:19 -0500 Subject: [PATCH 159/201] ARROW-6708: [C++] Fix hardcoded boost library names In the case where `find_package(Boost ...)` succeeded, just use whatever it gave us. Closes #5510 from pitrou/ARROW-6708-boost-library-static and squashes the following commits: da8d28a30 ARROW-6708: Fix hardcoded boost library names Authored-by: Antoine Pitrou Signed-off-by: Wes McKinney --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 4b707a2092a8..ce495f177f8f 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -641,6 +641,10 @@ if(TARGET Boost::system) set(BOOST_SYSTEM_LIBRARY Boost::system) set(BOOST_FILESYSTEM_LIBRARY Boost::filesystem) set(BOOST_REGEX_LIBRARY Boost::regex) +elseif(BoostAlt_FOUND) + set(BOOST_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY}) + set(BOOST_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY}) + set(BOOST_REGEX_LIBRARY ${Boost_REGEX_LIBRARY}) else() set(BOOST_SYSTEM_LIBRARY boost_system_static) set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_static) @@ -648,8 +652,8 @@ else() endif() set(ARROW_BOOST_LIBS ${BOOST_SYSTEM_LIBRARY} ${BOOST_FILESYSTEM_LIBRARY}) -message(STATUS "Boost include dir: " ${Boost_INCLUDE_DIR}) -message(STATUS "Boost libraries: " ${Boost_LIBRARIES}) 
+message(STATUS "Boost include dir: ${Boost_INCLUDE_DIR}") +message(STATUS "Boost libraries: ${ARROW_BOOST_LIBS}") include_directories(SYSTEM ${Boost_INCLUDE_DIR}) From 98d8a6d796f838ed55766a8318c83915eb05696e Mon Sep 17 00:00:00 2001 From: liyafan82 Date: Mon, 30 Sep 2019 19:07:01 -0500 Subject: [PATCH 160/201] ARROW-6722: [Java] Provide a uniform way to get vector name Currently, the getName method is defined in BaseValueVector, as an abstract class. However, some vector does not extend the BaseValueVector, like StructVector, UnionVector, ZeroVector. In this issue, we move the method to ValueVector interface, the base interface for all vectors. This makes it easier to get a vector's name without checking its type. Closes #5525 from liyafan82/fly_0927_name and squashes the following commits: 8e3c2897a Provide a uniform way to get vector name Authored-by: liyafan82 Signed-off-by: Wes McKinney --- java/vector/src/main/codegen/templates/UnionVector.java | 5 +++++ .../main/java/org/apache/arrow/vector/BaseValueVector.java | 2 -- .../src/main/java/org/apache/arrow/vector/ValueVector.java | 6 ++++++ .../src/main/java/org/apache/arrow/vector/ZeroVector.java | 5 +++++ .../arrow/vector/complex/AbstractContainerVector.java | 5 +++++ 5 files changed, 21 insertions(+), 2 deletions(-) diff --git a/java/vector/src/main/codegen/templates/UnionVector.java b/java/vector/src/main/codegen/templates/UnionVector.java index eef2708b5311..b513ba539927 100644 --- a/java/vector/src/main/codegen/templates/UnionVector.java +++ b/java/vector/src/main/codegen/templates/UnionVector.java @@ -690,4 +690,9 @@ public int hashCode(int index) { public OUT accept(VectorVisitor visitor, IN value) { return visitor.visit(this, value); } + + @Override + public String getName() { + return name; + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java index 166af07055d0..f600d2ed1002 100644 --- 
a/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java @@ -55,8 +55,6 @@ protected BaseValueVector(BufferAllocator allocator) { this.allocator = Preconditions.checkNotNull(allocator, "allocator cannot be null"); } - public abstract String getName(); - @Override public String toString() { return super.toString() + "[name = " + getName() + ", ...]"; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java index 1cfa81dbba3b..9a8647a4e2e7 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java @@ -271,4 +271,10 @@ public interface ValueVector extends Closeable, Iterable { * @param the input data together with visitor. */ OUT accept(VectorVisitor visitor, IN value); + + /** + * Gets the name of the vector. + * @return the name of the vector. 
+ */ + String getName(); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java index 41631b131a15..14c671ff12ad 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java @@ -265,4 +265,9 @@ public void copyFrom(int fromIndex, int thisIndex, ValueVector from) { public void copyFromSafe(int fromIndex, int thisIndex, ValueVector from) { throw new UnsupportedOperationException(); } + + @Override + public String getName() { + return getField().getName(); + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java index b3fcd90744dd..3451d1cbcc42 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java @@ -132,4 +132,9 @@ public void copyFrom(int fromIndex, int thisIndex, ValueVector from) { public void copyFromSafe(int fromIndex, int thisIndex, ValueVector from) { throw new UnsupportedOperationException(); } + + @Override + public String getName() { + return name; + } } From 155415c63de5b3d6257feec57f25be5208f8ebf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 1 Oct 2019 15:22:19 +0200 Subject: [PATCH 161/201] ARROW-6655: [Python] Filesystem bindings for S3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add support for S3FileSystem in the python bindings. 
- Fixed issue with reading all the content of an S3 object - Introduce `minio_server` fixture for parametrized testing of all filesystem implementations - Fixed s3fs parquet test and updated it to use minio_server fixture Closes #5423 from kszucs/s3 and squashes the following commits: 384c96052 Resolve review comments 73e6625f9 S3Options 98bd91ad1 remove commented tests db89859ca rename to s3fs 44784582a fix read() issue c1df10b92 initialization in first use 192ab6547 flake8 f70f9fbd8 remove minio-client dependency d399643dc simplify test suite fee57a9a4 remove accidentally committed files 751cfd429 resolve a couple of review comments; enum workaround 45436f7b3 cython flake8 38dcb88d0 rat c541b3e15 comment left 098048a8a more compat 00340ed4c fixture error handling 2be25ce29 auto initialize s3 on import 88e0c9f79 py2 compat 8585a6085 py2 compat d37228711 install minio in the conda-toolchain build 041cad42a executable flag fb0f2813a travis minio install script 8cbe0eeef travis osx 72e56a68f enable S3 in travis python builds 7800c75d8 appveyor flag 7daf5668d fix syntax error in travis script 68eb59161 enable PYARROW_WITH_S3 on appveyor 2cb19d1ff conditional import of test dependencies efa05d28e use minio for dask.s3fs test too f25ae5aed travis 9042c7e4c use S3FS_DIR 9ce7180d1 cmake format; fix orc cimport 45a2a17ba docstrings c0b91621e test requirements; flake8 44aedfd10 stat test a343950e2 testing suite 1551b525c wip dd41d21c2 imports 5200af16f s3 filesystem bindings Authored-by: Krisztián Szűcs Signed-off-by: Antoine Pitrou --- .travis.yml | 7 + ci/conda_env_python.yml | 1 + ci/cpp-msvc-build-main.bat | 3 + ci/travis_install_linux.sh | 8 - ci/travis_install_minio.sh | 35 ++ ci/travis_install_osx.sh | 2 +- ci/travis_script_python.sh | 8 + cpp/cmake_modules/ThirdpartyToolchain.cmake | 10 + cpp/src/arrow/filesystem/s3fs.cc | 6 + cpp/src/arrow/filesystem/s3fs.h | 2 +- cpp/src/arrow/filesystem/s3fs_test.cc | 11 +- python/CMakeLists.txt | 4 + python/pyarrow/_csv.pyx | 4 
+- python/pyarrow/_cuda.pyx | 4 +- python/pyarrow/_fs.pxd | 68 +++ python/pyarrow/_fs.pyx | 69 +-- python/pyarrow/_json.pyx | 4 +- python/pyarrow/_orc.pxd | 4 +- python/pyarrow/_orc.pyx | 2 +- python/pyarrow/_parquet.pxd | 10 +- python/pyarrow/_parquet.pyx | 8 +- python/pyarrow/_s3fs.pyx | 173 +++++++ python/pyarrow/feather.pxi | 4 +- python/pyarrow/fs.py | 9 +- python/pyarrow/includes/libarrow.pxd | 167 +++++-- python/pyarrow/includes/libarrow_fs.pxd | 92 ---- python/pyarrow/includes/libarrow_s3fs.pxd | 60 +++ python/pyarrow/io-hdfs.pxi | 4 +- python/pyarrow/io.pxi | 56 +-- python/pyarrow/ipc.pxi | 28 +- python/pyarrow/lib.pxd | 24 +- python/pyarrow/s3fs.py | 27 ++ python/pyarrow/serialization.pxi | 6 +- python/pyarrow/tests/conftest.py | 55 +++ python/pyarrow/tests/test_flight.py | 12 +- python/pyarrow/tests/test_fs.py | 474 ++++++++++++-------- python/pyarrow/tests/test_parquet.py | 55 ++- python/pyarrow/util.py | 10 + python/requirements-test.txt | 1 + python/setup.py | 7 + 40 files changed, 1054 insertions(+), 480 deletions(-) create mode 100755 ci/travis_install_minio.sh create mode 100644 python/pyarrow/_fs.pxd create mode 100644 python/pyarrow/_s3fs.pyx delete mode 100644 python/pyarrow/includes/libarrow_fs.pxd create mode 100644 python/pyarrow/includes/libarrow_s3fs.pxd create mode 100644 python/pyarrow/s3fs.py diff --git a/.travis.yml b/.travis.yml index 4fc143a493db..990fd2256674 100644 --- a/.travis.yml +++ b/.travis.yml @@ -87,6 +87,7 @@ matrix: - if [ $ARROW_CI_CPP_AFFECTED != "1" ] && [ $ARROW_CI_JAVA_AFFECTED != "1" ]; then exit; fi - $TRAVIS_BUILD_DIR/ci/travis_install_clang_tools.sh - $TRAVIS_BUILD_DIR/ci/travis_install_linux.sh + - $TRAVIS_BUILD_DIR/ci/travis_install_minio.sh # If either C++ or Python changed, we must install the C++ libraries - git submodule update --init - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh @@ -110,12 +111,14 @@ matrix: - ARROW_TRAVIS_USE_SYSTEM_JAVA=1 - ARROW_TRAVIS_USE_TOOLCHAIN=1 - ARROW_TRAVIS_VALGRIND=1 + - 
ARROW_TRAVIS_S3=1 # TODO(wesm): Run the benchmarks outside of Travis # - ARROW_TRAVIS_PYTHON_BENCHMARKS=1 before_script: - if [ $ARROW_CI_PYTHON_AFFECTED != "1" ] && [ $ARROW_CI_DOCS_AFFECTED != "1" ]; then exit; fi - $TRAVIS_BUILD_DIR/ci/travis_install_clang_tools.sh - $TRAVIS_BUILD_DIR/ci/travis_install_linux.sh + - $TRAVIS_BUILD_DIR/ci/travis_install_minio.sh - $TRAVIS_BUILD_DIR/ci/travis_install_toolchain.sh script: - $TRAVIS_BUILD_DIR/ci/travis_script_java.sh || travis_terminate 1 @@ -136,6 +139,7 @@ matrix: - ARROW_TRAVIS_PLASMA=1 - ARROW_TRAVIS_FLIGHT=1 - ARROW_TRAVIS_ORC=1 + - ARROW_TRAVIS_S3=1 - ARROW_TRAVIS_PARQUET=1 # TODO(ARROW-4763): llvm and llvmdev packages are in conflict: # https://github.com/conda-forge/llvmdev-feedstock/issues/60 @@ -149,6 +153,7 @@ matrix: - if [ $ARROW_CI_CPP_AFFECTED != "1" ] && [ $ARROW_CI_JAVA_AFFECTED != "1" ]; then exit; fi # If either C++ or Python changed, we must install the C++ libraries - git submodule update --init + - $TRAVIS_BUILD_DIR/ci/travis_install_minio.sh - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh script: - $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh || travis_terminate 1 @@ -161,6 +166,7 @@ matrix: cache: addons: env: + - ARROW_TRAVIS_S3=1 - ARROW_TRAVIS_PLASMA=1 - ARROW_TRAVIS_USE_TOOLCHAIN=1 - ARROW_BUILD_WARNING_LEVEL=CHECKIN @@ -170,6 +176,7 @@ matrix: before_script: script: - if [ $ARROW_CI_PYTHON_AFFECTED != "1" ]; then exit; fi + - $TRAVIS_BUILD_DIR/ci/travis_install_minio.sh - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 3.6 - name: "Java OpenJDK8 and OpenJDK11" language: cpp diff --git a/ci/conda_env_python.yml b/ci/conda_env_python.yml index a0cd737b326c..0e6e5bf8d5c3 100644 --- a/ci/conda_env_python.yml +++ b/ci/conda_env_python.yml @@ -22,6 +22,7 @@ numpy>=1.14 pandas pytest pytest-faulthandler +pytest-lazy-fixture pytz setuptools setuptools_scm=3.2.0 diff --git a/ci/cpp-msvc-build-main.bat b/ci/cpp-msvc-build-main.bat index b088e2eec763..b6d1b20a5109 100644 --- a/ci/cpp-msvc-build-main.bat 
+++ b/ci/cpp-msvc-build-main.bat @@ -98,6 +98,9 @@ pip install -r requirements.txt pickle5 set PYARROW_CXXFLAGS=%ARROW_CXXFLAGS% set PYARROW_CMAKE_GENERATOR=%GENERATOR% +if "%ARROW_S3%" == "ON" ( + set PYARROW_WITH_S3=ON +) if "%ARROW_BUILD_FLIGHT%" == "ON" ( @rem ARROW-5441: bundling Arrow Flight libraries not implemented set PYARROW_BUNDLE_ARROW_CPP=OFF diff --git a/ci/travis_install_linux.sh b/ci/travis_install_linux.sh index a5283139a833..441d50d9b16c 100755 --- a/ci/travis_install_linux.sh +++ b/ci/travis_install_linux.sh @@ -42,14 +42,6 @@ if [ "$ARROW_TRAVIS_GANDIVA" == "1" ]; then sudo apt-get install -y -qq llvm-$ARROW_LLVM_MAJOR_VERSION-dev fi -if [ "$ARROW_TRAVIS_S3" == "1" ]; then - # Download the Minio S3 server into PATH - S3FS_DIR=~/.local/bin/ - mkdir -p $S3FS_DIR - wget --directory-prefix $S3FS_DIR https://dl.min.io/server/minio/release/linux-amd64/minio - chmod +x $S3FS_DIR/minio -fi - if [ "$ARROW_TRAVIS_USE_SYSTEM" == "1" ]; then if [ "$DISTRO_CODENAME" == "xenial" ]; then # TODO(ARROW-4761): Install libzstd-dev once we support zstd<1 diff --git a/ci/travis_install_minio.sh b/ci/travis_install_minio.sh new file mode 100755 index 000000000000..5459bb8bfc4a --- /dev/null +++ b/ci/travis_install_minio.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e +set -x + +if [ "$ARROW_TRAVIS_S3" == "1" ]; then + # Download the Minio S3 server into PATH + if [ $TRAVIS_OS_NAME = "osx" ]; then + MINIO_URL=https://dl.min.io/server/minio/release/darwin-amd64/minio + else + MINIO_URL=https://dl.min.io/server/minio/release/linux-amd64/minio + fi + + S3FS_DIR=~/.local/bin/ + mkdir -p $S3FS_DIR + wget --directory-prefix $S3FS_DIR $MINIO_URL + chmod +x $S3FS_DIR/minio +fi \ No newline at end of file diff --git a/ci/travis_install_osx.sh b/ci/travis_install_osx.sh index 38e971710dea..2d79eb017ed5 100755 --- a/ci/travis_install_osx.sh +++ b/ci/travis_install_osx.sh @@ -40,4 +40,4 @@ if [ "$ARROW_CI_RUBY_AFFECTED" = "1" ]; then run_brew bundle --file=$TRAVIS_BUILD_DIR/cpp/Brewfile --verbose run_brew bundle --file=$TRAVIS_BUILD_DIR/c_glib/Brewfile --verbose rm ${brew_log_path} -fi +fi \ No newline at end of file diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index 6b095b878e59..8b3066f6e768 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -100,6 +100,10 @@ CMAKE_COMMON_FLAGS="-DARROW_EXTRA_ERROR_CONTEXT=ON" PYTHON_CPP_BUILD_TARGETS="arrow_python-all plasma parquet" +if [ "$ARROW_TRAVIS_S3" == "1" ]; then + CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_S3=ON" +fi + if [ "$ARROW_TRAVIS_FLIGHT" == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_FLIGHT=ON" fi @@ -164,6 +168,9 @@ export PYARROW_BUILD_TYPE=$ARROW_BUILD_TYPE export PYARROW_WITH_PARQUET=1 export PYARROW_WITH_PLASMA=1 export PYARROW_WITH_ORC=1 +if [ "$ARROW_TRAVIS_S3" == "1" ]; then + export PYARROW_WITH_S3=1 +fi if [ "$ARROW_TRAVIS_FLIGHT" == "1" ]; then export PYARROW_WITH_FLIGHT=1 fi @@ -177,6 +184,7 @@ python setup.py develop python -c "import pyarrow.parquet" python -c "import pyarrow.plasma" python -c "import pyarrow.orc" +python -c "import pyarrow.fs" # Ensure we do eagerly import pandas (or 
other expensive imports) python < scripts/test_imports.py diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index ce495f177f8f..ace517124b65 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -2542,6 +2542,16 @@ if(ARROW_S3) include_directories(SYSTEM ${AWSSDK_INCLUDE_DIR}) message(STATUS "Found AWS SDK headers: ${AWSSDK_INCLUDE_DIR}") message(STATUS "Found AWS SDK libraries: ${AWSSDK_LINK_LIBRARIES}") + + if(APPLE) + # CoreFoundation's path is hardcoded in the CMake files provided by + # aws-sdk-cpp to use the MacOSX SDK provided by XCode which makes + # XCode a hard dependency. Command Line Tools is often used instead + # of the full XCode suite, so let the linker to find it. + set_target_properties(AWS::aws-c-common + PROPERTIES INTERFACE_LINK_LIBRARIES + "-pthread;pthread;-framework CoreFoundation") + endif() endif() # Write out the package configurations. diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index 8053c223ade6..3fb1cabae301 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -341,6 +341,12 @@ class ObjectInputFile : public io::RandomAccessFile { RETURN_NOT_OK(CheckClosed()); RETURN_NOT_OK(CheckPosition(position, "read")); + nbytes = std::min(nbytes, content_length_ - position); + if (nbytes == 0) { + *bytes_read = 0; + return Status::OK(); + } + // Read the desired range of bytes S3Model::GetObjectResult result; RETURN_NOT_OK(GetObjectRange(client_, path_, position, nbytes, &result)); diff --git a/cpp/src/arrow/filesystem/s3fs.h b/cpp/src/arrow/filesystem/s3fs.h index 8dd3d87f8ff8..c4dacd84fc79 100644 --- a/cpp/src/arrow/filesystem/s3fs.h +++ b/cpp/src/arrow/filesystem/s3fs.h @@ -132,7 +132,7 @@ class ARROW_EXPORT S3FileSystem : public FileSystem { std::unique_ptr impl_; }; -enum class S3LogLevel { Off, Fatal, Error, Warn, Info, Debug, Trace }; +enum class S3LogLevel : 
int8_t { Off, Fatal, Error, Warn, Info, Debug, Trace }; struct ARROW_EXPORT S3GlobalOptions { S3LogLevel log_level; diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc b/cpp/src/arrow/filesystem/s3fs_test.cc index 8564901ae400..c868e8a2be85 100644 --- a/cpp/src/arrow/filesystem/s3fs_test.cc +++ b/cpp/src/arrow/filesystem/s3fs_test.cc @@ -687,7 +687,7 @@ TEST_F(TestS3FS, OpenInputStream) { TEST_F(TestS3FS, OpenInputFile) { std::shared_ptr file; std::shared_ptr buf; - int64_t nbytes = -1, pos = -1; + int64_t nbytes = -1, pos = -1, bytes_read = 0; // Non-existent ASSERT_RAISES(IOError, fs_->OpenInputFile("non-existent-bucket/somefile", &file)); @@ -712,6 +712,15 @@ TEST_F(TestS3FS, OpenInputFile) { AssertBufferEqual(*buf, "data"); ASSERT_OK(file->ReadAt(9, 20, &buf)); AssertBufferEqual(*buf, ""); + + char result[10]; + ASSERT_OK(file->ReadAt(2, 5, &bytes_read, &result)); + ASSERT_EQ(bytes_read, 5); + ASSERT_OK(file->ReadAt(5, 20, &bytes_read, &result)); + ASSERT_EQ(bytes_read, 4); + ASSERT_OK(file->ReadAt(9, 0, &bytes_read, &result)); + ASSERT_EQ(bytes_read, 0); + // Reading past end of file ASSERT_RAISES(IOError, file->ReadAt(10, 20, &buf)); diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index b2282a6b69bc..6925efd2d17e 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -385,6 +385,10 @@ set(CYTHON_EXTENSIONS lib _fs _csv _json) set(LINK_LIBS arrow_shared arrow_python_shared) +if(PYARROW_BUILD_S3) + set(CYTHON_EXTENSIONS ${CYTHON_EXTENSIONS} _s3fs) +endif() + if(PYARROW_BUILD_CUDA) # Arrow CUDA find_package(ArrowCuda) diff --git a/python/pyarrow/_csv.pyx b/python/pyarrow/_csv.pyx index 25ff47d39b54..5dccad760f42 100644 --- a/python/pyarrow/_csv.pyx +++ b/python/pyarrow/_csv.pyx @@ -466,7 +466,7 @@ cdef class ConvertOptions: self.options.include_missing_columns = value -cdef _get_reader(input_file, shared_ptr[InputStream]* out): +cdef _get_reader(input_file, shared_ptr[CInputStream]* out): use_memory_map = False get_input_stream(input_file, 
use_memory_map, out) @@ -522,7 +522,7 @@ def read_csv(input_file, read_options=None, parse_options=None, Contents of the CSV file as a in-memory table. """ cdef: - shared_ptr[InputStream] stream + shared_ptr[CInputStream] stream CCSVReadOptions c_read_options CCSVParseOptions c_parse_options CCSVConvertOptions c_convert_options diff --git a/python/pyarrow/_cuda.pyx b/python/pyarrow/_cuda.pyx index a9f51b0a654f..e0cad68b2bfe 100644 --- a/python/pyarrow/_cuda.pyx +++ b/python/pyarrow/_cuda.pyx @@ -729,7 +729,7 @@ cdef class BufferReader(NativeFile): self.buffer = obj self.reader = new CCudaBufferReader(self.buffer.buffer) self.set_random_access_file( - shared_ptr[RandomAccessFile](self.reader)) + shared_ptr[CRandomAccessFile](self.reader)) self.is_readable = True def read_buffer(self, nbytes=None): @@ -776,7 +776,7 @@ cdef class BufferWriter(NativeFile): def __cinit__(self, CudaBuffer buffer): self.buffer = buffer self.writer = new CCudaBufferWriter(self.buffer.cuda_buffer) - self.set_output_stream(shared_ptr[OutputStream](self.writer)) + self.set_output_stream(shared_ptr[COutputStream](self.writer)) self.is_writable = True def writeat(self, int64_t position, object data): diff --git a/python/pyarrow/_fs.pxd b/python/pyarrow/_fs.pxd new file mode 100644 index 000000000000..11b5769f854e --- /dev/null +++ b/python/pyarrow/_fs.pxd @@ -0,0 +1,68 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# cython: language_level = 3 + +import six + +from pyarrow.compat import frombytes, tobytes +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport PyDateTime_from_TimePoint +from pyarrow.lib import _detect_compression +from pyarrow.lib cimport * + + +cpdef enum FileType: + NonExistent = CFileType_NonExistent + Unknown = CFileType_Unknown + File = CFileType_File + Directory = CFileType_Directory + + +cdef class FileStats: + cdef: + CFileStats stats + + @staticmethod + cdef FileStats wrap(CFileStats stats) + + +cdef class Selector: + cdef: + CSelector selector + + +cdef class FileSystem: + cdef: + shared_ptr[CFileSystem] wrapped + CFileSystem* fs + + cdef init(self, const shared_ptr[CFileSystem]& wrapped) + + +cdef class LocalFileSystem(FileSystem): + cdef: + CLocalFileSystem* localfs + + cdef init(self, const shared_ptr[CFileSystem]& wrapped) + + +cdef class SubTreeFileSystem(FileSystem): + cdef: + CSubTreeFileSystem* subtreefs + + cdef init(self, const shared_ptr[CFileSystem]& wrapped) diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index 769ca8d7391f..39079aee1051 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -22,17 +22,8 @@ import six from pyarrow.compat import frombytes, tobytes from pyarrow.includes.common cimport * from pyarrow.includes.libarrow cimport PyDateTime_from_TimePoint -from pyarrow.includes.libarrow_fs cimport * -from pyarrow.util import _stringify_path from pyarrow.lib import _detect_compression -from pyarrow.lib cimport ( - check_status, - NativeFile, - BufferedOutputStream, - 
BufferedInputStream, - CompressedInputStream, - CompressedOutputStream -) +from pyarrow.lib cimport * cdef inline c_string _path_as_bytes(path) except *: @@ -46,20 +37,12 @@ cdef inline c_string _path_as_bytes(path) except *: return tobytes(path) -cpdef enum FileType: - NonExistent = CFileType_NonExistent - Unknown = CFileType_Unknown - File = CFileType_File - Directory = CFileType_Directory - - cdef class FileStats: """FileSystem entry stats""" - cdef CFileStats stats - def __init__(self): - raise TypeError('dont initialize me') + raise TypeError("FileStats cannot be instantiated directly, use " + "FileSystem.get_target_stats method instead.") @staticmethod cdef FileStats wrap(CFileStats stats): @@ -115,9 +98,7 @@ cdef class FileStats: Only regular files are guaranteed to have a size. """ if self.stats.type() != CFileType_File: - raise ValueError( - 'Only regular files are guaranteed to have a size' - ) + return None return self.stats.size() @property @@ -146,7 +127,7 @@ cdef class Selector: Parameters ---------- - base_dir : str or pathlib.Path + base_dir : str The directory in which to select files. Relative paths also work, use '.' for the current directory and '..' for the parent. allow_non_existent : bool, default False @@ -156,7 +137,6 @@ cdef class Selector: recursive : bool, default False Whether to recurse into subdirectories. """ - cdef CSelector selector def __init__(self, base_dir, bint allow_non_existent=False, bint recursive=False): @@ -192,10 +172,6 @@ cdef class Selector: cdef class FileSystem: """Abstract file system API""" - cdef: - shared_ptr[CFileSystem] wrapped - CFileSystem* fs - def __init__(self): raise TypeError("FileSystem is an abstract class, instantiate one of " "the subclasses instead: LocalFileSystem or " @@ -249,7 +225,7 @@ cdef class FileSystem: Parameters ---------- - path : str or pathlib.Path + path : str The path of the new directory. recursive: bool, default True Create nested directories as well. 
@@ -263,7 +239,7 @@ cdef class FileSystem: Parameters ---------- - path : str or pathlib.Path + path : str The path of the directory to be deleted. """ cdef c_string directory = _path_as_bytes(path) @@ -280,9 +256,9 @@ cdef class FileSystem: Parameters ---------- - src : str or pathlib.Path + src : str The path of the file or the directory to be moved. - dest : str or pathlib.Path + dest : str The destination path where the file or directory is moved to. """ cdef: @@ -299,9 +275,9 @@ cdef class FileSystem: Parameters ---------- - src : str or pathlib.Path + src : str The path of the file to be copied from. - dest : str or pathlib.Path + dest : str The destination path where the file is copied to. """ cdef: @@ -315,7 +291,7 @@ cdef class FileSystem: Parameters ---------- - path : str or pathlib.Path + path : str The path of the file to be deleted. """ cdef c_string file = _path_as_bytes(path) @@ -345,7 +321,7 @@ cdef class FileSystem: Parameters ---------- - path : Union[str, pathlib.Path] + path : str The source to open for reading. Returns @@ -369,7 +345,7 @@ cdef class FileSystem: Parameters ---------- - source: str or pathlib.Path + source: str The source to open for reading. compression: str optional, default 'detect' The compression algorithm to use for on-the-fly decompression. @@ -407,7 +383,7 @@ cdef class FileSystem: Parameters ---------- - path : str or pathlib.Path + path : str The source to open for writing. compression: str optional, default 'detect' The compression algorithm to use for on-the-fly compression. @@ -445,7 +421,7 @@ cdef class FileSystem: Parameters ---------- - path : str or pathlib.Path + path : str The source to open for writing. compression: str optional, default 'detect' The compression algorithm to use for on-the-fly compression. @@ -484,9 +460,6 @@ cdef class LocalFileSystem(FileSystem): except when deleting an entry). 
""" - cdef: - CLocalFileSystem* localfs - def __init__(self): cdef shared_ptr[CLocalFileSystem] wrapped wrapped = make_shared[CLocalFileSystem]() @@ -506,10 +479,14 @@ cdef class SubTreeFileSystem(FileSystem): Note, that this makes no security guarantee. For example, symlinks may allow to "escape" the subtree and access other parts of the underlying filesystem. - """ - cdef: - CSubTreeFileSystem* subtreefs + Parameters + ---------- + base_path: str + The root of the subtree. + base_fs: FileSystem + FileSystem object the operations delegated to. + """ def __init__(self, base_path, FileSystem base_fs): cdef: diff --git a/python/pyarrow/_json.pyx b/python/pyarrow/_json.pyx index ffbf01c09e7c..da3588a5e625 100644 --- a/python/pyarrow/_json.pyx +++ b/python/pyarrow/_json.pyx @@ -135,7 +135,7 @@ cdef class ParseOptions: self.options.newlines_in_values = value -cdef _get_reader(input_file, shared_ptr[InputStream]* out): +cdef _get_reader(input_file, shared_ptr[CInputStream]* out): use_memory_map = False get_input_stream(input_file, use_memory_map, out) @@ -175,7 +175,7 @@ def read_json(input_file, read_options=None, parse_options=None, Contents of the JSON file as a in-memory table. 
""" cdef: - shared_ptr[InputStream] stream + shared_ptr[CInputStream] stream CJSONReadOptions c_read_options CJSONParseOptions c_parse_options shared_ptr[CJSONReader] reader diff --git a/python/pyarrow/_orc.pxd b/python/pyarrow/_orc.pxd index ebbf8beda828..649fe8248f41 100644 --- a/python/pyarrow/_orc.pxd +++ b/python/pyarrow/_orc.pxd @@ -28,7 +28,7 @@ from pyarrow.includes.libarrow cimport (CArray, CSchema, CStatus, CKeyValueMetadata, CRecordBatch, CTable, - RandomAccessFile, OutputStream, + CRandomAccessFile, COutputStream, TimeUnit) @@ -37,7 +37,7 @@ cdef extern from "arrow/adapters/orc/adapter.h" \ cdef cppclass ORCFileReader: @staticmethod - CStatus Open(const shared_ptr[RandomAccessFile]& file, + CStatus Open(const shared_ptr[CRandomAccessFile]& file, CMemoryPool* pool, unique_ptr[ORCFileReader]* reader) diff --git a/python/pyarrow/_orc.pyx b/python/pyarrow/_orc.pyx index c9f5b2e158d6..0ee3ca632e46 100644 --- a/python/pyarrow/_orc.pyx +++ b/python/pyarrow/_orc.pyx @@ -46,7 +46,7 @@ cdef class ORCReader: def open(self, object source, c_bool use_memory_map=True): cdef: - shared_ptr[RandomAccessFile] rd_handle + shared_ptr[CRandomAccessFile] rd_handle self.source = source diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index 19fb214c8f04..5cf3ff5931b4 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -24,7 +24,7 @@ from pyarrow.includes.common cimport * from pyarrow.includes.libarrow cimport (CChunkedArray, CSchema, CStatus, CTable, CMemoryPool, CBuffer, CKeyValueMetadata, - RandomAccessFile, OutputStream, + CRandomAccessFile, COutputStream, TimeUnit) @@ -316,7 +316,7 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: unique_ptr[CRowGroupMetaData] RowGroup(int i) const SchemaDescriptor* schema() shared_ptr[const CKeyValueMetadata] key_value_metadata() const - void WriteTo(OutputStream* dst) const + void WriteTo(COutputStream* dst) const cdef shared_ptr[CFileMetaData] CFileMetaData_Make \ " 
parquet::FileMetaData::Make"(const void* serialized_metadata, @@ -406,7 +406,7 @@ cdef extern from "parquet/arrow/reader.h" namespace "parquet::arrow" nogil: cdef cppclass FileReaderBuilder: FileReaderBuilder() - CStatus Open(const shared_ptr[RandomAccessFile]& file, + CStatus Open(const shared_ptr[CRandomAccessFile]& file, const CReaderProperties& properties, const shared_ptr[CFileMetaData]& metadata) @@ -435,7 +435,7 @@ cdef extern from "parquet/arrow/writer.h" namespace "parquet::arrow" nogil: @staticmethod CStatus Open(const CSchema& schema, CMemoryPool* pool, - const shared_ptr[OutputStream]& sink, + const shared_ptr[COutputStream]& sink, const shared_ptr[WriterProperties]& properties, const shared_ptr[ArrowWriterProperties]& arrow_properties, unique_ptr[FileWriter]* writer) @@ -448,4 +448,4 @@ cdef extern from "parquet/arrow/writer.h" namespace "parquet::arrow" nogil: CStatus WriteMetaDataFile( const CFileMetaData& file_metadata, - const OutputStream* sink) + const COutputStream* sink) diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 3d62c0eb13e6..f93def8a67ee 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -568,7 +568,7 @@ cdef class FileMetaData: def __reduce__(self): cdef: NativeFile sink = BufferOutputStream() - OutputStream* c_sink = sink.get_output_stream().get() + COutputStream* c_sink = sink.get_output_stream().get() with nogil: self._metadata.WriteTo(c_sink) @@ -694,7 +694,7 @@ cdef class FileMetaData: Write the metadata object to a metadata-only file """ cdef: - shared_ptr[OutputStream] sink + shared_ptr[COutputStream] sink c_string c_where try: @@ -1010,7 +1010,7 @@ cdef class ParquetReader: read_dictionary=None, FileMetaData metadata=None, int buffer_size=0): cdef: - shared_ptr[RandomAccessFile] rd_handle + shared_ptr[CRandomAccessFile] rd_handle shared_ptr[CFileMetaData] c_metadata CReaderProperties properties = default_reader_properties() ArrowReaderProperties arrow_props = ( @@ -1202,7 
+1202,7 @@ cdef class ParquetReader: cdef class ParquetWriter: cdef: unique_ptr[FileWriter] writer - shared_ptr[OutputStream] sink + shared_ptr[COutputStream] sink bint own_sink cdef readonly: diff --git a/python/pyarrow/_s3fs.pyx b/python/pyarrow/_s3fs.pyx new file mode 100644 index 000000000000..d1f820e51958 --- /dev/null +++ b/python/pyarrow/_s3fs.pyx @@ -0,0 +1,173 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# cython: language_level = 3 + +import six + +from pyarrow.lib cimport check_status +from pyarrow.compat import frombytes, tobytes +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport * +from pyarrow.includes.libarrow_s3fs cimport * +from pyarrow._fs cimport FileSystem + + +cpdef enum S3LogLevel: + Off = CS3LogLevel_Off + Fatal = CS3LogLevel_Fatal + Error = CS3LogLevel_Error + Warn = CS3LogLevel_Warn + Info = CS3LogLevel_Info + Debug = CS3LogLevel_Debug + Trace = CS3LogLevel_Trace + + +def initialize_s3(S3LogLevel log_level=S3LogLevel.Error): + cdef CS3GlobalOptions options + options.log_level = log_level + check_status(CInitializeS3(options)) + + +def finalize_s3(): + check_status(CFinalizeS3()) + + +cdef class S3Options: + """Options for S3FileSystem. 
+ + If neither access_key nor secret_key are provided then attempts to + initialize from AWS environment variables, otherwise both access_key and + secret_key must be provided. + + Parameters + ---------- + access_key: str, default None + AWS Access Key ID. Pass None to use the standard AWS environment + variables and/or configuration file. + secret_key: str, default None + AWS Secret Access key. Pass None to use the standard AWS environment + variables and/or configuration file. + region: str, default 'us-east-1' + AWS region to connect to. + scheme: str, default 'https' + S3 connection transport scheme. + endpoint_override: str, default None + Override region with a connect string such as "localhost:9000" + background_writes: boolean, default True + Whether OutputStream writes will be issued in the background, without + blocking. + """ + cdef: + CS3Options options + + # Avoid mistakingly creating attributes + __slots__ = () + + def __init__(self, access_key=None, secret_key=None, region=None, + scheme=None, endpoint_override=None, background_writes=None): + if access_key is not None and secret_key is None: + raise ValueError( + 'In order to initialize with explicit credentials both ' + 'access_key and secret_key must be provided, ' + '`secret_key` is not set.' + ) + elif access_key is None and secret_key is not None: + raise ValueError( + 'In order to initialize with explicit credentials both ' + 'access_key and secret_key must be provided, ' + '`access_key` is not set.' 
+ ) + elif access_key is not None or secret_key is not None: + self.options = CS3Options.FromAccessKey( + tobytes(access_key), + tobytes(secret_key) + ) + else: + self.options = CS3Options.Defaults() + + if region is not None: + self.region = region + if scheme is not None: + self.scheme = scheme + if endpoint_override is not None: + self.endpoint_override = endpoint_override + if background_writes is not None: + self.background_writes = background_writes + + @property + def region(self): + """AWS region to connect to.""" + return frombytes(self.options.region) + + @region.setter + def region(self, value): + self.options.region = tobytes(value) + + @property + def scheme(self): + """S3 connection transport scheme.""" + return frombytes(self.options.scheme) + + @scheme.setter + def scheme(self, value): + self.options.scheme = tobytes(value) + + @property + def endpoint_override(self): + """Override region with a connect string such as localhost:9000""" + return frombytes(self.options.endpoint_override) + + @endpoint_override.setter + def endpoint_override(self, value): + self.options.endpoint_override = tobytes(value) + + @property + def background_writes(self): + """OutputStream writes will be issued in the background""" + return self.options.background_writes + + @background_writes.setter + def background_writes(self, bint value): + self.options.background_writes = value + + +cdef class S3FileSystem(FileSystem): + """S3-backed FileSystem implementation + + Note: S3 buckets are special and the operations available on them may be + limited or more expensive than desired. + + Parameters + ---------- + options: S3Options, default None + Options for connecting to S3. If None is passed then attempts to + initialize the connection from AWS environment variables. 
+ """ + + cdef: + CS3FileSystem* s3fs + + def __init__(self, S3Options options=None): + cdef shared_ptr[CS3FileSystem] wrapped + options = options or S3Options() + check_status(CS3FileSystem.Make(options.options, &wrapped)) + self.init( wrapped) + + cdef init(self, const shared_ptr[CFileSystem]& wrapped): + FileSystem.init(self, wrapped) + self.s3fs = wrapped.get() diff --git a/python/pyarrow/feather.pxi b/python/pyarrow/feather.pxi index 6fd13bc04b46..8700f67ae621 100644 --- a/python/pyarrow/feather.pxi +++ b/python/pyarrow/feather.pxi @@ -34,7 +34,7 @@ cdef class FeatherWriter: self.num_rows = -1 def open(self, object dest): - cdef shared_ptr[OutputStream] sink + cdef shared_ptr[COutputStream] sink get_writer(dest, &sink) with nogil: @@ -76,7 +76,7 @@ cdef class FeatherReader: pass def open(self, source, c_bool use_memory_map=True): - cdef shared_ptr[RandomAccessFile] reader + cdef shared_ptr[CRandomAccessFile] reader get_reader(source, use_memory_map, &reader) with nogil: diff --git a/python/pyarrow/fs.py b/python/pyarrow/fs.py index cd5263acbcad..5f257d07f300 100644 --- a/python/pyarrow/fs.py +++ b/python/pyarrow/fs.py @@ -17,4 +17,11 @@ from __future__ import absolute_import -from pyarrow._fs import * # noqa +from pyarrow._fs import ( # noqa + Selector, + FileType, + FileStats, + FileSystem, + LocalFileSystem, + SubTreeFileSystem +) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 69dafa4e46f2..82085487eda3 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -765,13 +765,16 @@ cdef extern from "arrow/io/api.h" namespace "arrow::io" nogil: CStatus Write(const uint8_t* data, int64_t nbytes) CStatus Flush() - cdef cppclass OutputStream(FileInterface, Writable): + cdef cppclass COutputStream" arrow::io::OutputStream"(FileInterface, + Writable): pass - cdef cppclass InputStream(FileInterface, Readable): + cdef cppclass CInputStream" arrow::io::InputStream"(FileInterface, + 
Readable): pass - cdef cppclass RandomAccessFile(InputStream, Seekable): + cdef cppclass CRandomAccessFile" arrow::io::RandomAccessFile"(CInputStream, + Seekable): CStatus GetSize(int64_t* size) CStatus ReadAt(int64_t position, int64_t nbytes, @@ -780,24 +783,24 @@ cdef extern from "arrow/io/api.h" namespace "arrow::io" nogil: shared_ptr[CBuffer]* out) c_bool supports_zero_copy() - cdef cppclass WritableFile(OutputStream, Seekable): + cdef cppclass WritableFile(COutputStream, Seekable): CStatus WriteAt(int64_t position, const uint8_t* data, int64_t nbytes) - cdef cppclass ReadWriteFileInterface(RandomAccessFile, + cdef cppclass ReadWriteFileInterface(CRandomAccessFile, WritableFile): pass - cdef cppclass FileSystem: + cdef cppclass CIOFileSystem" arrow::io::FileSystem": CStatus Stat(const c_string& path, FileStatistics* stat) - cdef cppclass FileOutputStream(OutputStream): + cdef cppclass FileOutputStream(COutputStream): @staticmethod - CStatus Open(const c_string& path, shared_ptr[OutputStream]* file) + CStatus Open(const c_string& path, shared_ptr[COutputStream]* file) int file_descriptor() - cdef cppclass ReadableFile(RandomAccessFile): + cdef cppclass ReadableFile(CRandomAccessFile): @staticmethod CStatus Open(const c_string& path, shared_ptr[ReadableFile]* file) @@ -823,46 +826,46 @@ cdef extern from "arrow/io/api.h" namespace "arrow::io" nogil: int file_descriptor() cdef cppclass CCompressedInputStream \ - " arrow::io::CompressedInputStream"(InputStream): + " arrow::io::CompressedInputStream"(CInputStream): @staticmethod CStatus Make(CMemoryPool* pool, CCodec* codec, - shared_ptr[InputStream] raw, + shared_ptr[CInputStream] raw, shared_ptr[CCompressedInputStream]* out) @staticmethod - CStatus Make(CCodec* codec, shared_ptr[InputStream] raw, + CStatus Make(CCodec* codec, shared_ptr[CInputStream] raw, shared_ptr[CCompressedInputStream]* out) cdef cppclass CCompressedOutputStream \ - " arrow::io::CompressedOutputStream"(OutputStream): + " 
arrow::io::CompressedOutputStream"(COutputStream): @staticmethod CStatus Make(CMemoryPool* pool, CCodec* codec, - shared_ptr[OutputStream] raw, + shared_ptr[COutputStream] raw, shared_ptr[CCompressedOutputStream]* out) @staticmethod - CStatus Make(CCodec* codec, shared_ptr[OutputStream] raw, + CStatus Make(CCodec* codec, shared_ptr[COutputStream] raw, shared_ptr[CCompressedOutputStream]* out) cdef cppclass CBufferedInputStream \ - " arrow::io::BufferedInputStream"(InputStream): + " arrow::io::BufferedInputStream"(CInputStream): @staticmethod CStatus Create(int64_t buffer_size, CMemoryPool* pool, - shared_ptr[InputStream] raw, + shared_ptr[CInputStream] raw, shared_ptr[CBufferedInputStream]* out) - shared_ptr[InputStream] Detach() + shared_ptr[CInputStream] Detach() cdef cppclass CBufferedOutputStream \ - " arrow::io::BufferedOutputStream"(OutputStream): + " arrow::io::BufferedOutputStream"(COutputStream): @staticmethod CStatus Create(int64_t buffer_size, CMemoryPool* pool, - shared_ptr[OutputStream] raw, + shared_ptr[COutputStream] raw, shared_ptr[CBufferedOutputStream]* out) - CStatus Detach(shared_ptr[OutputStream]* raw) + CStatus Detach(shared_ptr[COutputStream]* raw) # ---------------------------------------------------------------------- # HDFS @@ -894,13 +897,14 @@ cdef extern from "arrow/io/api.h" namespace "arrow::io" nogil: int64_t block_size int16_t permissions - cdef cppclass HdfsReadableFile(RandomAccessFile): + cdef cppclass HdfsReadableFile(CRandomAccessFile): pass - cdef cppclass HdfsOutputStream(OutputStream): + cdef cppclass HdfsOutputStream(COutputStream): pass - cdef cppclass CHadoopFileSystem" arrow::io::HadoopFileSystem"(FileSystem): + cdef cppclass CHadoopFileSystem \ + "arrow::io::HadoopFileSystem"(CIOFileSystem): @staticmethod CStatus Connect(const HdfsConnectionConfig* config, shared_ptr[CHadoopFileSystem]* client) @@ -936,16 +940,16 @@ cdef extern from "arrow/io/api.h" namespace "arrow::io" nogil: shared_ptr[HdfsOutputStream]* handle) cdef 
cppclass CBufferReader \ - " arrow::io::BufferReader"(RandomAccessFile): + " arrow::io::BufferReader"(CRandomAccessFile): CBufferReader(const shared_ptr[CBuffer]& buffer) CBufferReader(const uint8_t* data, int64_t nbytes) cdef cppclass CBufferOutputStream \ - " arrow::io::BufferOutputStream"(OutputStream): + " arrow::io::BufferOutputStream"(COutputStream): CBufferOutputStream(const shared_ptr[CResizableBuffer]& buffer) cdef cppclass CMockOutputStream \ - " arrow::io::MockOutputStream"(OutputStream): + " arrow::io::MockOutputStream"(COutputStream): CMockOutputStream() int64_t GetExtentBytesWritten() @@ -958,6 +962,71 @@ cdef extern from "arrow/io/api.h" namespace "arrow::io" nogil: void set_memcopy_threshold(int64_t threshold) +cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: + + ctypedef enum CFileType "arrow::fs::FileType": + CFileType_NonExistent "arrow::fs::FileType::NonExistent" + CFileType_Unknown "arrow::fs::FileType::Unknown" + CFileType_File "arrow::fs::FileType::File" + CFileType_Directory "arrow::fs::FileType::Directory" + + cdef cppclass CTimePoint "arrow::fs::TimePoint": + pass + + cdef cppclass CFileStats "arrow::fs::FileStats": + CFileStats() + CFileStats(CFileStats&&) + CFileStats& operator=(CFileStats&&) + CFileStats(const CFileStats&) + CFileStats& operator=(const CFileStats&) + + CFileType type() + void set_type(CFileType type) + c_string path() + void set_path(const c_string& path) + c_string base_name() + int64_t size() + void set_size(int64_t size) + c_string extension() + CTimePoint mtime() + void set_mtime(CTimePoint mtime) + + cdef cppclass CSelector "arrow::fs::Selector": + CSelector() + c_string base_dir + c_bool allow_non_existent + c_bool recursive + + cdef cppclass CFileSystem "arrow::fs::FileSystem": + CStatus GetTargetStats(const c_string& path, CFileStats* out) + CStatus GetTargetStats(const vector[c_string]& paths, + vector[CFileStats]* out) + CStatus GetTargetStats(const CSelector& select, + 
vector[CFileStats]* out) + CStatus CreateDir(const c_string& path, c_bool recursive) + CStatus DeleteDir(const c_string& path) + CStatus DeleteFile(const c_string& path) + CStatus DeleteFiles(const vector[c_string]& paths) + CStatus Move(const c_string& src, const c_string& dest) + CStatus CopyFile(const c_string& src, const c_string& dest) + CStatus OpenInputStream(const c_string& path, + shared_ptr[CInputStream]* out) + CStatus OpenInputFile(const c_string& path, + shared_ptr[CRandomAccessFile]* out) + CStatus OpenOutputStream(const c_string& path, + shared_ptr[COutputStream]* out) + CStatus OpenAppendStream(const c_string& path, + shared_ptr[COutputStream]* out) + + cdef cppclass CLocalFileSystem "arrow::fs::LocalFileSystem"(CFileSystem): + LocalFileSystem() + + cdef cppclass CSubTreeFileSystem \ + "arrow::fs::SubTreeFileSystem"(CFileSystem): + CSubTreeFileSystem(const c_string& base_path, + shared_ptr[CFileSystem] base_fs) + + cdef extern from "arrow/ipc/api.h" namespace "arrow::ipc" nogil: enum MessageType" arrow::ipc::Message::Type": MessageType_SCHEMA" arrow::ipc::Message::SCHEMA" @@ -1001,14 +1070,14 @@ cdef extern from "arrow/ipc/api.h" namespace "arrow::ipc" nogil: MetadataVersion metadata_version() MessageType type() - CStatus SerializeTo(OutputStream* stream, const CIpcOptions& options, + CStatus SerializeTo(COutputStream* stream, const CIpcOptions& options, int64_t* output_length) c_string FormatMessageType(MessageType type) cdef cppclass CMessageReader" arrow::ipc::MessageReader": @staticmethod - unique_ptr[CMessageReader] Open(const shared_ptr[InputStream]& stream) + unique_ptr[CMessageReader] Open(const shared_ptr[CInputStream]& stream) CStatus ReadNextMessage(unique_ptr[CMessage]* out) @@ -1020,7 +1089,7 @@ cdef extern from "arrow/ipc/api.h" namespace "arrow::ipc" nogil: cdef cppclass CRecordBatchStreamReader \ " arrow::ipc::RecordBatchStreamReader"(CRecordBatchReader): @staticmethod - CStatus Open(const InputStream* stream, + CStatus Open(const 
CInputStream* stream, shared_ptr[CRecordBatchReader]* out) @staticmethod @@ -1031,24 +1100,24 @@ cdef extern from "arrow/ipc/api.h" namespace "arrow::ipc" nogil: " arrow::ipc::RecordBatchStreamWriter"(CRecordBatchWriter): @staticmethod CResult[shared_ptr[CRecordBatchWriter]] Open( - OutputStream* sink, const shared_ptr[CSchema]& schema, + COutputStream* sink, const shared_ptr[CSchema]& schema, CIpcOptions& options) cdef cppclass CRecordBatchFileWriter \ " arrow::ipc::RecordBatchFileWriter"(CRecordBatchWriter): @staticmethod CResult[shared_ptr[CRecordBatchWriter]] Open( - OutputStream* sink, const shared_ptr[CSchema]& schema, + COutputStream* sink, const shared_ptr[CSchema]& schema, CIpcOptions& options) cdef cppclass CRecordBatchFileReader \ " arrow::ipc::RecordBatchFileReader": @staticmethod - CStatus Open(RandomAccessFile* file, + CStatus Open(CRandomAccessFile* file, shared_ptr[CRecordBatchFileReader]* out) @staticmethod - CStatus Open2" Open"(RandomAccessFile* file, + CStatus Open2" Open"(CRandomAccessFile* file, int64_t footer_offset, shared_ptr[CRecordBatchFileReader]* out) @@ -1058,16 +1127,16 @@ cdef extern from "arrow/ipc/api.h" namespace "arrow::ipc" nogil: CStatus ReadRecordBatch(int i, shared_ptr[CRecordBatch]* batch) - CStatus ReadMessage(InputStream* stream, unique_ptr[CMessage]* message) + CStatus ReadMessage(CInputStream* stream, unique_ptr[CMessage]* message) CStatus GetRecordBatchSize(const CRecordBatch& batch, int64_t* size) CStatus GetTensorSize(const CTensor& tensor, int64_t* size) - CStatus WriteTensor(const CTensor& tensor, OutputStream* dst, + CStatus WriteTensor(const CTensor& tensor, COutputStream* dst, int32_t* metadata_length, int64_t* body_length) - CStatus ReadTensor(InputStream* stream, shared_ptr[CTensor]* out) + CStatus ReadTensor(CInputStream* stream, shared_ptr[CTensor]* out) CStatus ReadRecordBatch(const CMessage& message, const shared_ptr[CSchema]& schema, @@ -1082,16 +1151,16 @@ cdef extern from "arrow/ipc/api.h" namespace 
"arrow::ipc" nogil: CMemoryPool* pool, shared_ptr[CBuffer]* out) - CStatus ReadSchema(InputStream* stream, CDictionaryMemo* dictionary_memo, + CStatus ReadSchema(CInputStream* stream, CDictionaryMemo* dictionary_memo, shared_ptr[CSchema]* out) CStatus ReadRecordBatch(const shared_ptr[CSchema]& schema, CDictionaryMemo* dictionary_memo, - InputStream* stream, + CInputStream* stream, shared_ptr[CRecordBatch]* out) - CStatus AlignStream(InputStream* stream, int64_t alignment) - CStatus AlignStream(OutputStream* stream, int64_t alignment) + CStatus AlignStream(CInputStream* stream, int64_t alignment) + CStatus AlignStream(COutputStream* stream, int64_t alignment) cdef CStatus GetRecordBatchPayload\ " arrow::ipc::internal::GetRecordBatchPayload"( @@ -1102,7 +1171,7 @@ cdef extern from "arrow/ipc/api.h" namespace "arrow::ipc" nogil: cdef cppclass CFeatherWriter" arrow::ipc::feather::TableWriter": @staticmethod - CStatus Open(const shared_ptr[OutputStream]& stream, + CStatus Open(const shared_ptr[COutputStream]& stream, unique_ptr[CFeatherWriter]* out) void SetDescription(const c_string& desc) @@ -1113,7 +1182,7 @@ cdef extern from "arrow/ipc/api.h" namespace "arrow::ipc" nogil: cdef cppclass CFeatherReader" arrow::ipc::feather::TableReader": @staticmethod - CStatus Open(const shared_ptr[RandomAccessFile]& file, + CStatus Open(const shared_ptr[CRandomAccessFile]& file, unique_ptr[CFeatherReader]* out) c_string GetDescription() @@ -1172,7 +1241,7 @@ cdef extern from "arrow/csv/api.h" namespace "arrow::csv" nogil: cdef cppclass CCSVReader" arrow::csv::TableReader": @staticmethod - CStatus Make(CMemoryPool*, shared_ptr[InputStream], + CStatus Make(CMemoryPool*, shared_ptr[CInputStream], CCSVReadOptions, CCSVParseOptions, CCSVConvertOptions, shared_ptr[CCSVReader]* out) @@ -1200,7 +1269,7 @@ cdef extern from "arrow/json/reader.h" namespace "arrow::json" nogil: cdef cppclass CJSONReader" arrow::json::TableReader": @staticmethod - CStatus Make(CMemoryPool*, 
shared_ptr[InputStream], + CStatus Make(CMemoryPool*, shared_ptr[CInputStream], CJSONReadOptions, CJSONParseOptions, shared_ptr[CJSONReader]* out) @@ -1379,10 +1448,10 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: CStatus Make(const uint8_t* data, int64_t size, object base, shared_ptr[CBuffer]* out) - cdef cppclass PyReadableFile(RandomAccessFile): + cdef cppclass PyReadableFile(CRandomAccessFile): PyReadableFile(object fo) - cdef cppclass PyOutputStream(OutputStream): + cdef cppclass PyOutputStream(COutputStream): PyOutputStream(object fo) cdef cppclass PandasOptions: @@ -1398,7 +1467,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: shared_ptr[CRecordBatch] batch vector[shared_ptr[CTensor]] tensors - CStatus WriteTo(OutputStream* dst) + CStatus WriteTo(COutputStream* dst) CStatus GetComponents(CMemoryPool* pool, PyObject** dst) CStatus SerializeObject(object context, object sequence, @@ -1408,7 +1477,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: const CSerializedPyObject& obj, PyObject* base, PyObject** out) - CStatus ReadSerializedObject(RandomAccessFile* src, + CStatus ReadSerializedObject(CRandomAccessFile* src, CSerializedPyObject* out) CStatus GetSerializedFromComponents(int num_tensors, int num_ndarrays, diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd deleted file mode 100644 index f54a2e50357e..000000000000 --- a/python/pyarrow/includes/libarrow_fs.pxd +++ /dev/null @@ -1,92 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# distutils: language = c++ - -from libcpp.functional cimport function - -from pyarrow.includes.common cimport * -from pyarrow.includes.libarrow cimport ( - InputStream as CInputStream, - OutputStream as COutputStream, - RandomAccessFile as CRandomAccessFile -) - - -cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: - - enum CFileType "arrow::fs::FileType": - CFileType_NonExistent "arrow::fs::FileType::NonExistent" - CFileType_Unknown "arrow::fs::FileType::Unknown" - CFileType_File "arrow::fs::FileType::File" - CFileType_Directory "arrow::fs::FileType::Directory" - - cdef cppclass CTimePoint "arrow::fs::TimePoint": - pass - - cdef cppclass CFileStats "arrow::fs::FileStats": - CFileStats() - CFileStats(CFileStats&&) - CFileStats& operator=(CFileStats&&) - CFileStats(const CFileStats&) - CFileStats& operator=(const CFileStats&) - - CFileType type() - void set_type(CFileType type) - c_string path() - void set_path(const c_string& path) - c_string base_name() - int64_t size() - void set_size(int64_t size) - c_string extension() - CTimePoint mtime() - void set_mtime(CTimePoint mtime) - - cdef cppclass CSelector "arrow::fs::Selector": - CSelector() - c_string base_dir - c_bool allow_non_existent - c_bool recursive - - cdef cppclass CFileSystem "arrow::fs::FileSystem": - CStatus GetTargetStats(const c_string& path, CFileStats* out) - CStatus GetTargetStats(const vector[c_string]& paths, - vector[CFileStats]* out) - CStatus GetTargetStats(const CSelector& select, - vector[CFileStats]* out) - CStatus CreateDir(const c_string& path, c_bool 
recursive) - CStatus DeleteDir(const c_string& path) - CStatus DeleteFile(const c_string& path) - CStatus DeleteFiles(const vector[c_string]& paths) - CStatus Move(const c_string& src, const c_string& dest) - CStatus CopyFile(const c_string& src, const c_string& dest) - CStatus OpenInputStream(const c_string& path, - shared_ptr[CInputStream]* out) - CStatus OpenInputFile(const c_string& path, - shared_ptr[CRandomAccessFile]* out) - CStatus OpenOutputStream(const c_string& path, - shared_ptr[COutputStream]* out) - CStatus OpenAppendStream(const c_string& path, - shared_ptr[COutputStream]* out) - - cdef cppclass CLocalFileSystem "arrow::fs::LocalFileSystem"(CFileSystem): - LocalFileSystem() - - cdef cppclass CSubTreeFileSystem \ - "arrow::fs::SubTreeFileSystem"(CFileSystem): - CSubTreeFileSystem(const c_string& base_path, - shared_ptr[CFileSystem] base_fs) diff --git a/python/pyarrow/includes/libarrow_s3fs.pxd b/python/pyarrow/includes/libarrow_s3fs.pxd new file mode 100644 index 000000000000..8dc109c5e6e3 --- /dev/null +++ b/python/pyarrow/includes/libarrow_s3fs.pxd @@ -0,0 +1,60 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# distutils: language = c++ + +from libcpp.functional cimport function + +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport CFileSystem + +cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: + + ctypedef enum CS3LogLevel "arrow::fs::S3LogLevel": + CS3LogLevel_Off "arrow::fs::S3LogLevel::Off" + CS3LogLevel_Fatal "arrow::fs::S3LogLevel::Fatal" + CS3LogLevel_Error "arrow::fs::S3LogLevel::Error" + CS3LogLevel_Warn "arrow::fs::S3LogLevel::Warn" + CS3LogLevel_Info "arrow::fs::S3LogLevel::Info" + CS3LogLevel_Debug "arrow::fs::S3LogLevel::Debug" + CS3LogLevel_Trace "arrow::fs::S3LogLevel::Trace" + + cdef struct CS3GlobalOptions "arrow::fs::S3GlobalOptions": + CS3LogLevel log_level + + cdef cppclass CS3Options "arrow::fs::S3Options": + c_string region + c_string endpoint_override + c_string scheme + c_bool background_writes + void ConfigureDefaultCredentials() + void ConfigureAccessKey(const c_string& access_key, + const c_string& secret_key) + + @staticmethod + CS3Options Defaults() + @staticmethod + CS3Options FromAccessKey(const c_string& access_key, + const c_string& secret_key) + + cdef cppclass CS3FileSystem "arrow::fs::S3FileSystem"(CFileSystem): + @staticmethod + CStatus Make(const CS3Options& options, shared_ptr[CS3FileSystem]* out) + + cdef CStatus CInitializeS3 "arrow::fs::InitializeS3"( + const CS3GlobalOptions& options) + cdef CStatus CFinalizeS3 "arrow::fs::FinalizeS3"() diff --git a/python/pyarrow/io-hdfs.pxi b/python/pyarrow/io-hdfs.pxi index e9655e4a7f74..b224abbead82 100644 --- a/python/pyarrow/io-hdfs.pxi +++ b/python/pyarrow/io-hdfs.pxi @@ -424,7 +424,7 @@ cdef class HadoopFileSystem: c_replication, c_default_block_size, &wr_handle)) - out.set_output_stream( wr_handle) + out.set_output_stream( wr_handle) out.is_writable = True else: with nogil: @@ -432,7 +432,7 @@ cdef class HadoopFileSystem: .OpenReadable(c_path, &rd_handle)) out.set_random_access_file( - rd_handle) + rd_handle) out.is_readable = True 
assert not out.closed diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 6ff1403006ef..0cdf21efc010 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -133,29 +133,29 @@ cdef class NativeFile: else: check_status(self.output_stream.get().Close()) - cdef set_random_access_file(self, shared_ptr[RandomAccessFile] handle): - self.input_stream = handle + cdef set_random_access_file(self, shared_ptr[CRandomAccessFile] handle): + self.input_stream = handle self.random_access = handle self.is_seekable = True - cdef set_input_stream(self, shared_ptr[InputStream] handle): + cdef set_input_stream(self, shared_ptr[CInputStream] handle): self.input_stream = handle self.random_access.reset() self.is_seekable = False - cdef set_output_stream(self, shared_ptr[OutputStream] handle): + cdef set_output_stream(self, shared_ptr[COutputStream] handle): self.output_stream = handle - cdef shared_ptr[RandomAccessFile] get_random_access_file(self) except *: + cdef shared_ptr[CRandomAccessFile] get_random_access_file(self) except *: self._assert_readable() self._assert_seekable() return self.random_access - cdef shared_ptr[InputStream] get_input_stream(self) except *: + cdef shared_ptr[CInputStream] get_input_stream(self) except *: self._assert_readable() return self.input_stream - cdef shared_ptr[OutputStream] get_output_stream(self) except *: + cdef shared_ptr[COutputStream] get_output_stream(self) except *: self._assert_writable() return self.output_stream @@ -679,11 +679,11 @@ cdef class PythonFile(NativeFile): if kind == 'r': self.set_random_access_file( - shared_ptr[RandomAccessFile](new PyReadableFile(handle))) + shared_ptr[CRandomAccessFile](new PyReadableFile(handle))) self.is_readable = True else: self.set_output_stream( - shared_ptr[OutputStream](new PyOutputStream(handle))) + shared_ptr[COutputStream](new PyOutputStream(handle))) self.is_writable = True def truncate(self, pos=None): @@ -720,8 +720,8 @@ cdef class MemoryMappedFile(NativeFile): 
result.path = path result.is_readable = True result.is_writable = True - result.set_output_stream( handle) - result.set_random_access_file( handle) + result.set_output_stream( handle) + result.set_random_access_file( handle) result.handle = handle return result @@ -750,8 +750,8 @@ cdef class MemoryMappedFile(NativeFile): with nogil: check_status(CMemoryMappedFile.Open(c_path, c_mode, &handle)) - self.set_output_stream( handle) - self.set_random_access_file( handle) + self.set_output_stream( handle) + self.set_random_access_file( handle) self.handle = handle def resize(self, new_size): @@ -836,7 +836,7 @@ cdef class OSFile(NativeFile): check_status(ReadableFile.Open(path, pool, &handle)) self.is_readable = True - self.set_random_access_file( handle) + self.set_random_access_file( handle) cdef _open_writable(self, c_string path): with nogil: @@ -1174,7 +1174,7 @@ cdef class BufferReader(NativeFile): def __cinit__(self, object obj): self.buffer = as_buffer(obj) - self.set_random_access_file(shared_ptr[RandomAccessFile]( + self.set_random_access_file(shared_ptr[CRandomAccessFile]( new CBufferReader(self.buffer.buffer))) self.is_readable = True @@ -1204,7 +1204,7 @@ cdef class CompressedInputStream(NativeFile): check_status(CCompressedInputStream.Make( codec.get(), stream.get_input_stream(), &compressed_stream)) - self.set_input_stream( compressed_stream) + self.set_input_stream( compressed_stream) self.is_readable = True @@ -1234,13 +1234,13 @@ cdef class CompressedOutputStream(NativeFile): check_status(CCompressedOutputStream.Make( codec.get(), stream.get_output_stream(), &compressed_stream)) - self.set_output_stream( compressed_stream) + self.set_output_stream( compressed_stream) self.is_writable = True ctypedef CBufferedInputStream* _CBufferedInputStreamPtr ctypedef CBufferedOutputStream* _CBufferedOutputStreamPtr -ctypedef RandomAccessFile* _RandomAccessFilePtr +ctypedef CRandomAccessFile* _RandomAccessFilePtr cdef class BufferedInputStream(NativeFile): @@ -1255,7 
+1255,7 @@ cdef class BufferedInputStream(NativeFile): buffer_size, maybe_unbox_memory_pool(memory_pool), stream.get_input_stream(), &buffered_stream)) - self.set_input_stream( buffered_stream) + self.set_input_stream( buffered_stream) self.is_readable = True def detach(self): @@ -1269,7 +1269,7 @@ cdef class BufferedInputStream(NativeFile): The underlying raw input stream """ cdef: - shared_ptr[InputStream] c_raw + shared_ptr[CInputStream] c_raw _CBufferedInputStreamPtr buffered NativeFile raw @@ -1287,7 +1287,7 @@ cdef class BufferedInputStream(NativeFile): # selectively. if dynamic_cast[_RandomAccessFilePtr](c_raw.get()) != nullptr: raw.set_random_access_file( - static_pointer_cast[RandomAccessFile, InputStream](c_raw)) + static_pointer_cast[CRandomAccessFile, CInputStream](c_raw)) else: raw.set_input_stream(c_raw) return raw @@ -1305,7 +1305,7 @@ cdef class BufferedOutputStream(NativeFile): buffer_size, maybe_unbox_memory_pool(memory_pool), stream.get_output_stream(), &buffered_stream)) - self.set_output_stream( buffered_stream) + self.set_output_stream( buffered_stream) self.is_writable = True def detach(self): @@ -1319,7 +1319,7 @@ cdef class BufferedOutputStream(NativeFile): The underlying raw output stream """ cdef: - shared_ptr[OutputStream] c_raw + shared_ptr[COutputStream] c_raw _CBufferedOutputStreamPtr buffered NativeFile raw @@ -1400,7 +1400,7 @@ cdef NativeFile _get_native_file(object source, c_bool use_memory_map): cdef get_reader(object source, c_bool use_memory_map, - shared_ptr[RandomAccessFile]* reader): + shared_ptr[CRandomAccessFile]* reader): cdef NativeFile nf nf = _get_native_file(source, use_memory_map) @@ -1408,7 +1408,7 @@ cdef get_reader(object source, c_bool use_memory_map, cdef get_input_stream(object source, c_bool use_memory_map, - shared_ptr[InputStream]* out): + shared_ptr[CInputStream]* out): """ Like get_reader(), but can automatically decompress, and returns an InputStream. 
@@ -1416,7 +1416,7 @@ cdef get_input_stream(object source, c_bool use_memory_map, cdef: NativeFile nf unique_ptr[CCodec] codec - shared_ptr[InputStream] input_stream + shared_ptr[CInputStream] input_stream shared_ptr[CCompressedInputStream] compressed_stream CompressionType compression_type @@ -1435,12 +1435,12 @@ cdef get_input_stream(object source, c_bool use_memory_map, check_status(CCodec.Create(compression_type, &codec)) check_status(CCompressedInputStream.Make(codec.get(), input_stream, &compressed_stream)) - input_stream = compressed_stream + input_stream = compressed_stream out[0] = input_stream -cdef get_writer(object source, shared_ptr[OutputStream]* writer): +cdef get_writer(object source, shared_ptr[COutputStream]* writer): cdef NativeFile nf try: diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi index 3d8b3f4af4db..c9684f13e5bb 100644 --- a/python/pyarrow/ipc.pxi +++ b/python/pyarrow/ipc.pxi @@ -17,6 +17,7 @@ import warnings + cdef class Message: """ Container for an Arrow IPC message with metadata and optional body @@ -76,7 +77,7 @@ cdef class Message: """ cdef: int64_t output_length = 0 - OutputStream* out + COutputStream* out CIpcOptions options options.alignment = alignment @@ -136,9 +137,11 @@ cdef class MessageReader: @staticmethod def open_stream(source): - cdef MessageReader result = MessageReader.__new__(MessageReader) - cdef shared_ptr[InputStream] in_stream - cdef unique_ptr[CMessageReader] reader + cdef: + MessageReader result = MessageReader.__new__(MessageReader) + shared_ptr[CInputStream] in_stream + unique_ptr[CMessageReader] reader + _get_input_stream(source, &in_stream) with nogil: reader = CMessageReader.Open(in_stream) @@ -250,7 +253,7 @@ cdef class _CRecordBatchWriter: cdef class _RecordBatchStreamWriter(_CRecordBatchWriter): cdef: - shared_ptr[OutputStream] sink + shared_ptr[COutputStream] sink CIpcOptions options bint closed @@ -276,7 +279,7 @@ cdef class _RecordBatchStreamWriter(_CRecordBatchWriter): self.writer = 
GetResultValue(result) -cdef _get_input_stream(object source, shared_ptr[InputStream]* out): +cdef _get_input_stream(object source, shared_ptr[CInputStream]* out): try: source = as_buffer(source) except TypeError: @@ -332,7 +335,7 @@ cdef class _CRecordBatchReader: cdef class _RecordBatchStreamReader(_CRecordBatchReader): cdef: - shared_ptr[InputStream] in_stream + shared_ptr[CInputStream] in_stream cdef readonly: Schema schema @@ -367,7 +370,7 @@ cdef class _RecordBatchFileWriter(_RecordBatchStreamWriter): cdef class _RecordBatchFileReader: cdef: shared_ptr[CRecordBatchFileReader] reader - shared_ptr[RandomAccessFile] file + shared_ptr[CRandomAccessFile] file cdef readonly: Schema schema @@ -516,9 +519,8 @@ def read_tensor(source): """ cdef: shared_ptr[CTensor] sp_tensor - InputStream* c_stream - - cdef NativeFile nf = as_native_file(source) + CInputStream* c_stream + NativeFile nf = as_native_file(source) c_stream = nf.get_input_stream().get() with nogil: @@ -540,7 +542,7 @@ def read_message(source): """ cdef: Message result = Message.__new__(Message) - InputStream* c_stream + CInputStream* c_stream cdef NativeFile nf = as_native_file(source) c_stream = nf.get_input_stream().get() @@ -571,7 +573,7 @@ def read_schema(obj, DictionaryMemo dictionary_memo=None): """ cdef: shared_ptr[CSchema] result - shared_ptr[RandomAccessFile] cpp_file + shared_ptr[CRandomAccessFile] cpp_file CDictionaryMemo temp_memo CDictionaryMemo* arg_dict_memo diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 553227a4e559..571dfaa9449d 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -436,9 +436,9 @@ cdef class ResizableBuffer(Buffer): cdef class NativeFile: cdef: - shared_ptr[InputStream] input_stream - shared_ptr[RandomAccessFile] random_access - shared_ptr[OutputStream] output_stream + shared_ptr[CInputStream] input_stream + shared_ptr[CRandomAccessFile] random_access + shared_ptr[COutputStream] output_stream bint is_readable bint is_writable bint 
is_seekable @@ -449,13 +449,13 @@ cdef class NativeFile: # extension classes are technically virtual in the C++ sense) we can expose # the arrow::io abstract file interfaces to other components throughout the # suite of Arrow C++ libraries - cdef set_random_access_file(self, shared_ptr[RandomAccessFile] handle) - cdef set_input_stream(self, shared_ptr[InputStream] handle) - cdef set_output_stream(self, shared_ptr[OutputStream] handle) + cdef set_random_access_file(self, shared_ptr[CRandomAccessFile] handle) + cdef set_input_stream(self, shared_ptr[CInputStream] handle) + cdef set_output_stream(self, shared_ptr[COutputStream] handle) - cdef shared_ptr[RandomAccessFile] get_random_access_file(self) except * - cdef shared_ptr[InputStream] get_input_stream(self) except * - cdef shared_ptr[OutputStream] get_output_stream(self) except * + cdef shared_ptr[CRandomAccessFile] get_random_access_file(self) except * + cdef shared_ptr[CInputStream] get_input_stream(self) except * + cdef shared_ptr[COutputStream] get_output_stream(self) except * cdef class BufferedInputStream(NativeFile): @@ -485,10 +485,10 @@ cdef class _CRecordBatchReader: cdef get_input_stream(object source, c_bool use_memory_map, - shared_ptr[InputStream]* reader) + shared_ptr[CInputStream]* reader) cdef get_reader(object source, c_bool use_memory_map, - shared_ptr[RandomAccessFile]* reader) -cdef get_writer(object source, shared_ptr[OutputStream]* writer) + shared_ptr[CRandomAccessFile]* reader) +cdef get_writer(object source, shared_ptr[COutputStream]* writer) # Default is allow_none=False cdef DataType ensure_type(object type, c_bool allow_none=*) diff --git a/python/pyarrow/s3fs.py b/python/pyarrow/s3fs.py new file mode 100644 index 000000000000..5619e186f9ea --- /dev/null +++ b/python/pyarrow/s3fs.py @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import absolute_import + +from pyarrow._s3fs import ( # noqa + initialize_s3, + finalize_s3, + S3Options, + S3FileSystem +) + +initialize_s3() diff --git a/python/pyarrow/serialization.pxi b/python/pyarrow/serialization.pxi index fba834434d02..cf343b0ba320 100644 --- a/python/pyarrow/serialization.pxi +++ b/python/pyarrow/serialization.pxi @@ -254,11 +254,11 @@ cdef class SerializedPyObject: """ Write serialized object to a sink """ - cdef shared_ptr[OutputStream] stream + cdef shared_ptr[COutputStream] stream get_writer(sink, &stream) self._write_to(stream.get()) - cdef _write_to(self, OutputStream* stream): + cdef _write_to(self, COutputStream* stream): with nogil: check_status(self.data.WriteTo(stream)) @@ -399,7 +399,7 @@ def read_serialized(source, base=None): ------- serialized : the serialized data """ - cdef shared_ptr[RandomAccessFile] stream + cdef shared_ptr[CRandomAccessFile] stream get_reader(source, True, &stream) cdef SerializedPyObject serialized = SerializedPyObject() diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index bfb1e8e5d0b5..1b980080d50b 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -16,6 +16,9 @@ # under the License. 
import os +import subprocess +import tempfile + import pytest import hypothesis as h @@ -24,6 +27,8 @@ except ImportError: import pathlib2 as pathlib # py2 compat +from pyarrow.util import find_free_port + # setup hypothesis profiles h.settings.register_profile('ci', max_examples=1000) @@ -130,6 +135,12 @@ except ImportError: pass +try: + import pyarrow.s3fs # noqa + defaults['s3'] = True +except ImportError: + pass + def pytest_configure(config): for mark in groups: @@ -211,3 +222,47 @@ def tempdir(tmpdir): @pytest.fixture(scope='session') def datadir(): return pathlib.Path(__file__).parent / 'data' + + +try: + from tempfile import TemporaryDirectory +except ImportError: + import shutil + + class TemporaryDirectory(object): + """Temporary directory implementation for python 2""" + + def __enter__(self): + self.tmp = tempfile.mkdtemp() + return self.tmp + + def __exit__(self, exc_type, exc_value, traceback): + shutil.rmtree(self.tmp) + + +@pytest.mark.s3 +@pytest.fixture(scope='session') +def minio_server(): + host, port = 'localhost', find_free_port() + access_key, secret_key = 'arrow', 'apachearrow' + + address = '{}:{}'.format(host, port) + env = os.environ.copy() + env.update({ + 'MINIO_ACCESS_KEY': access_key, + 'MINIO_SECRET_KEY': secret_key + }) + + with TemporaryDirectory() as tempdir: + args = ['minio', '--compat', 'server', '--quiet', '--address', + address, tempdir] + proc = None + try: + proc = subprocess.Popen(args, env=env) + except IOError: + pytest.skip('`minio` command cannot be located') + else: + yield address, access_key, secret_key + finally: + if proc is not None: + proc.kill() diff --git a/python/pyarrow/tests/test_flight.py b/python/pyarrow/tests/test_flight.py index 83afba6de65e..40099b06aec6 100644 --- a/python/pyarrow/tests/test_flight.py +++ b/python/pyarrow/tests/test_flight.py @@ -17,9 +17,7 @@ # under the License. 
import base64 -import contextlib import os -import socket import struct import tempfile import threading @@ -30,7 +28,7 @@ import pyarrow as pa from pyarrow.compat import tobytes -from pyarrow.util import pathlib +from pyarrow.util import pathlib, find_free_port try: from pyarrow import flight @@ -48,14 +46,6 @@ pytestmark = pytest.mark.flight -def find_free_port(): - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - with contextlib.closing(sock) as sock: - sock.bind(('', 0)) - sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - return sock.getsockname()[1] - - def test_import(): # So we see the ImportError somewhere import pyarrow.flight # noqa diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index f897e0d36f97..f6b6bf1d18cd 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -23,39 +23,109 @@ import pytest -from pyarrow import ArrowIOError +import pyarrow as pa +from pyarrow.tests.test_io import gzip_compress, gzip_decompress from pyarrow.fs import (FileType, Selector, FileSystem, LocalFileSystem, SubTreeFileSystem) -from pyarrow.tests.test_io import gzip_compress, gzip_decompress + + +@pytest.fixture +def localfs(request, tempdir): + return dict( + fs=LocalFileSystem(), + pathfn=lambda p: (tempdir / p).as_posix(), + allow_move_dir=True, + allow_append_to_file=True, + ) + + +@pytest.fixture +def subtree_localfs(request, tempdir, localfs): + prefix = 'subtree/prefix/' + (tempdir / prefix).mkdir(parents=True) + return dict( + fs=SubTreeFileSystem(prefix, localfs['fs']), + pathfn=prefix.__add__, + allow_move_dir=True, + allow_append_to_file=True, + ) + + +@pytest.mark.s3 +@pytest.fixture +def s3fs(request, minio_server): + from pyarrow.s3fs import S3Options, S3FileSystem + + address, access_key, secret_key = minio_server + bucket = 'pyarrow-filesystem/' + options = S3Options( + endpoint_override=address, + access_key=access_key, + secret_key=secret_key, + scheme='http' + ) + fs = 
S3FileSystem(options) + fs.create_dir(bucket) + + return dict( + fs=fs, + pathfn=bucket.__add__, + allow_move_dir=False, + allow_append_to_file=False, + ) + + +@pytest.fixture +def subtree_s3fs(request, s3fs): + prefix = 'pyarrow-filesystem/prefix/' + return dict( + fs=SubTreeFileSystem(prefix, s3fs['fs']), + pathfn=prefix.__add__, + allow_move_dir=False, + allow_append_to_file=False, + ) @pytest.fixture(params=[ pytest.param( - lambda tmp: LocalFileSystem(), - id='LocalFileSystem' + pytest.lazy_fixture('localfs'), + id='LocalFileSystem()' ), pytest.param( - lambda tmp: SubTreeFileSystem(tmp, LocalFileSystem()), - id='SubTreeFileSystem(LocalFileSystem)' + pytest.lazy_fixture('subtree_localfs'), + id='SubTreeFileSystem(LocalFileSystem())' + ), + pytest.param( + pytest.lazy_fixture('s3fs'), + id='S3FileSystem' + ), + pytest.param( + pytest.lazy_fixture('subtree_s3fs'), + id='SubTreeFileSystem(S3FileSystem())' ) ]) -def fs(request, tempdir): - return request.param(tempdir.as_posix()) +def filesystem_config(request): + return request.param + + +@pytest.fixture +def fs(request, filesystem_config): + return filesystem_config['fs'] @pytest.fixture -def testpath(request, fs, tempdir): - # we always use the tempdir for reading and writing test artifacts, but - # if the filesystem is wrapped in a SubTreeFileSystem then we don't need - # to prepend the path with the tempdir, we also test the API with both - # pathlib.Path objects and plain python strings - def convert(path): - if isinstance(fs, SubTreeFileSystem): - path = pathlib.Path(path) - else: - path = tempdir / path - return path.as_posix() - return convert +def pathfn(request, filesystem_config): + return filesystem_config['pathfn'] + + +@pytest.fixture +def allow_move_dir(request, filesystem_config): + return filesystem_config['allow_move_dir'] + + +@pytest.fixture +def allow_append_to_file(request, filesystem_config): + return filesystem_config['allow_append_to_file'] def test_cannot_instantiate_base_filesystem(): 
@@ -74,157 +144,158 @@ class Path: fs.create_dir(path) -def test_get_target_stats(fs, tempdir, testpath): - aaa, aaa_ = testpath('a/aa/aaa'), tempdir / 'a' / 'aa' / 'aaa' - bb, bb_ = testpath('a/bb'), tempdir / 'a' / 'bb' - c, c_ = testpath('c.txt'), tempdir / 'c.txt' - - aaa_.mkdir(parents=True) - bb_.touch() - c_.write_bytes(b'test') +def test_get_target_stats(fs, pathfn): + aaa = pathfn('a/aa/aaa/') + bb = pathfn('a/bb') + c = pathfn('c.txt') - def mtime_almost_equal(fs_dt, pathlib_ts): - # arrow's filesystem implementation truncates mtime to microsends - # resolution whereas pathlib rounds - pathlib_dt = datetime.utcfromtimestamp(pathlib_ts) - difference = (fs_dt - pathlib_dt).total_seconds() - return abs(difference) <= 10**-6 + fs.create_dir(aaa) + with fs.open_output_stream(bb): + pass # touch + with fs.open_output_stream(c) as fp: + fp.write(b'test') aaa_stat, bb_stat, c_stat = fs.get_target_stats([aaa, bb, c]) assert aaa_stat.path == aaa assert 'aaa' in repr(aaa_stat) - assert aaa_stat.base_name == 'aaa' assert aaa_stat.extension == '' - assert aaa_stat.type == FileType.Directory - assert mtime_almost_equal(aaa_stat.mtime, aaa_.stat().st_mtime) - with pytest.raises(ValueError): - aaa_stat.size + assert isinstance(aaa_stat.mtime, datetime) assert bb_stat.path == str(bb) assert bb_stat.base_name == 'bb' assert bb_stat.extension == '' assert bb_stat.type == FileType.File assert bb_stat.size == 0 - assert mtime_almost_equal(bb_stat.mtime, bb_.stat().st_mtime) + assert isinstance(bb_stat.mtime, datetime) assert c_stat.path == str(c) assert c_stat.base_name == 'c.txt' assert c_stat.extension == 'txt' assert c_stat.type == FileType.File assert c_stat.size == 4 - assert mtime_almost_equal(c_stat.mtime, c_.stat().st_mtime) - - -def test_get_target_stats_with_selector(fs, tempdir, testpath): - base_dir = testpath('.') - base_dir_ = tempdir - - selector = Selector(base_dir, allow_non_existent=False, recursive=True) - assert selector.base_dir == str(base_dir) - - 
(tempdir / 'test_file').touch() - (tempdir / 'test_directory').mkdir() - - stats = fs.get_target_stats(selector) - expected = list(base_dir_.iterdir()) - assert len(stats) == len(expected) - - for st in stats: - p = base_dir_ / st.path - if p.is_dir(): - assert st.type == FileType.Directory - if p.is_file(): - assert st.type == FileType.File - - -def test_create_dir(fs, tempdir, testpath): - directory = testpath('directory') - directory_ = tempdir / 'directory' - assert not directory_.exists() - fs.create_dir(directory) - assert directory_.exists() - - # recursive - directory = testpath('deeply/nested/directory') - directory_ = tempdir / 'deeply' / 'nested' / 'directory' - assert not directory_.exists() - with pytest.raises(ArrowIOError): - fs.create_dir(directory, recursive=False) - fs.create_dir(directory) - assert directory_.exists() - - -def test_delete_dir(fs, tempdir, testpath): - folder = testpath('directory') - nested = testpath('nested/directory') - folder_ = tempdir / 'directory' - nested_ = tempdir / 'nested' / 'directory' - - folder_.mkdir() - nested_.mkdir(parents=True) - - assert folder_.exists() - fs.delete_dir(folder) - assert not folder_.exists() - - assert nested_.exists() - fs.delete_dir(nested) - assert not nested_.exists() - - -def test_copy_file(fs, tempdir, testpath): - # copy file - source = testpath('source-file') - source_ = tempdir / 'source-file' - source_.touch() - target = testpath('target-file') - target_ = tempdir / 'target-file' - assert not target_.exists() - fs.copy_file(source, target) - assert source_.exists() - assert target_.exists() - - -def test_move(fs, tempdir, testpath): - # move directory - source = testpath('source-dir') - source_ = tempdir / 'source-dir' - source_.mkdir() - target = testpath('target-dir') - target_ = tempdir / 'target-dir' - assert not target_.exists() - fs.move(source, target) - assert not source_.exists() - assert target_.exists() - - # move file - source = testpath('source-file') - source_ = tempdir 
/ 'source-file' - source_.touch() - target = testpath('target-file') - target_ = tempdir / 'target-file' - assert not target_.exists() - fs.move(source, target) - assert not source_.exists() - assert target_.exists() - - -def test_delete_file(fs, tempdir, testpath): - target = testpath('target-file') - target_ = tempdir / 'target-file' - target_.touch() - assert target_.exists() - fs.delete_file(target) - assert not target_.exists() - - nested = testpath('nested/target-file') - nested_ = tempdir / 'nested/target-file' - nested_.parent.mkdir() - nested_.touch() - assert nested_.exists() - fs.delete_file(nested) - assert not nested_.exists() + assert isinstance(c_stat.mtime, datetime) + + +def test_get_target_stats_with_selector(fs, pathfn): + base_dir = pathfn('selector-dir/') + file_a = pathfn('selector-dir/test_file_a') + file_b = pathfn('selector-dir/test_file_b') + dir_a = pathfn('selector-dir/test_dir_a') + + try: + fs.create_dir(base_dir) + with fs.open_output_stream(file_a): + pass + with fs.open_output_stream(file_b): + pass + fs.create_dir(dir_a) + + selector = Selector(base_dir, allow_non_existent=False, recursive=True) + assert selector.base_dir == base_dir + + stats = fs.get_target_stats(selector) + assert len(stats) == 3 + + for st in stats: + if st.path.endswith(file_a): + assert st.type == FileType.File + elif st.path.endswith(file_b): + assert st.type == FileType.File + elif st.path.endswith(dir_a): + assert st.type == FileType.Directory + else: + raise ValueError('unexpected path {}'.format(st.path)) + finally: + fs.delete_file(file_a) + fs.delete_file(file_b) + fs.delete_dir(dir_a) + fs.delete_dir(base_dir) + + +def test_create_dir(fs, pathfn): + d = pathfn('test-directory/') + + with pytest.raises(pa.ArrowIOError): + fs.delete_dir(d) + + fs.create_dir(d) + fs.delete_dir(d) + + d = pathfn('deeply/nested/test-directory/') + fs.create_dir(d, recursive=True) + fs.delete_dir(d) + + +def test_delete_dir(fs, pathfn): + d = pathfn('directory/') + nd = 
pathfn('directory/nested/') + + fs.create_dir(nd) + fs.delete_dir(nd) + fs.delete_dir(d) + with pytest.raises(pa.ArrowIOError): + fs.delete_dir(d) + + +def test_copy_file(fs, pathfn): + s = pathfn('test-copy-source-file') + t = pathfn('test-copy-target-file') + + with fs.open_output_stream(s): + pass + + fs.copy_file(s, t) + fs.delete_file(s) + fs.delete_file(t) + + +def test_move_directory(fs, pathfn, allow_move_dir): + # move directory (doesn't work with S3) + s = pathfn('source-dir/') + t = pathfn('target-dir/') + + fs.create_dir(s) + + if allow_move_dir: + fs.move(s, t) + with pytest.raises(pa.ArrowIOError): + fs.delete_dir(s) + fs.delete_dir(t) + else: + with pytest.raises(pa.ArrowIOError): + fs.move(s, t) + + +def test_move_file(fs, pathfn): + s = pathfn('test-move-source-file') + t = pathfn('test-move-target-file') + + with fs.open_output_stream(s): + pass + + fs.move(s, t) + with pytest.raises(pa.ArrowIOError): + fs.delete_file(s) + fs.delete_file(t) + + +def test_delete_file(fs, pathfn): + p = pathfn('test-delete-target-file') + with fs.open_output_stream(p): + pass + + fs.delete_file(p) + with pytest.raises(pa.ArrowIOError): + fs.delete_file(p) + + d = pathfn('test-delete-nested') + fs.create_dir(d) + f = pathfn('test-delete-nested/target-file') + with fs.open_output_stream(f) as s: + s.write(b'data') + + fs.delete_dir(d) def identity(v): @@ -240,27 +311,28 @@ def identity(v): ('gzip', 256, gzip_compress), ] ) -def test_open_input_stream(fs, tempdir, testpath, compression, buffer_size, - compressor): - file = testpath('abc') - file_ = tempdir / 'abc' - data = b'some data' * 1024 - file_.write_bytes(compressor(data)) +def test_open_input_stream(fs, pathfn, compression, buffer_size, compressor): + p = pathfn('open-input-stream') - with fs.open_input_stream(file, compression, buffer_size) as f: - result = f.read() + data = b'some data for reading\n' * 512 + with fs.open_output_stream(p) as s: + s.write(compressor(data)) + + with fs.open_input_stream(p, 
compression, buffer_size) as s: + result = s.read() assert result == data -def test_open_input_file(fs, tempdir, testpath): - file = testpath('abc') - file_ = tempdir / 'abc' +def test_open_input_file(fs, pathfn): + p = pathfn('open-input-file') + data = b'some data' * 1024 - file_.write_bytes(data) + with fs.open_output_stream(p) as s: + s.write(data) read_from = len(b'some data') * 512 - with fs.open_input_file(file) as f: + with fs.open_input_file(p) as f: f.seek(read_from) result = f.read() @@ -276,16 +348,16 @@ def test_open_input_file(fs, tempdir, testpath): ('gzip', 256, gzip_decompress), ] ) -def test_open_output_stream(fs, tempdir, testpath, compression, buffer_size, +def test_open_output_stream(fs, pathfn, compression, buffer_size, decompressor): - file = testpath('abc') - file_ = tempdir / 'abc' + p = pathfn('open-output-stream') - data = b'some data' * 1024 - with fs.open_output_stream(file, compression, buffer_size) as f: + data = b'some data for writing' * 1024 + with fs.open_output_stream(p, compression, buffer_size) as f: f.write(data) - assert decompressor(file_.read_bytes()) == data + with fs.open_input_stream(p, compression, buffer_size) as f: + assert f.read(len(data)) == data @pytest.mark.parametrize( @@ -297,13 +369,57 @@ def test_open_output_stream(fs, tempdir, testpath, compression, buffer_size, ('gzip', 256, gzip_compress, gzip_decompress), ] ) -def test_open_append_stream(fs, tempdir, testpath, compression, buffer_size, - compressor, decompressor): - file = testpath('abc') - file_ = tempdir / 'abc' - file_.write_bytes(compressor(b'already existing')) +def test_open_append_stream(fs, pathfn, compression, buffer_size, compressor, + decompressor, allow_append_to_file): + p = pathfn('open-append-stream') + + initial = compressor(b'already existing') + with fs.open_output_stream(p) as s: + s.write(initial) + + if allow_append_to_file: + with fs.open_append_stream(p, compression, buffer_size) as f: + f.write(b'\nnewly added') - with 
fs.open_append_stream(file, compression, buffer_size) as f: - f.write(b'\nnewly added') + with fs.open_input_stream(p) as f: + result = f.read() - assert decompressor(file_.read_bytes()) == b'already existing\nnewly added' + result = decompressor(result) + assert result == b'already existing\nnewly added' + else: + with pytest.raises(pa.ArrowNotImplementedError): + fs.open_append_stream(p, compression, buffer_size) + + +@pytest.mark.s3 +def test_s3_options(minio_server): + from pyarrow.s3fs import S3Options + + options = S3Options() + + assert options.region == 'us-east-1' + options.region = 'us-west-1' + assert options.region == 'us-west-1' + + assert options.scheme == 'https' + options.scheme = 'http' + assert options.scheme == 'http' + + assert options.endpoint_override == '' + options.endpoint_override = 'localhost:8999' + assert options.endpoint_override == 'localhost:8999' + + with pytest.raises(ValueError): + S3Options(access_key='access') + with pytest.raises(ValueError): + S3Options(secret_key='secret') + + address, access_key, secret_key = minio_server + options = S3Options( + access_key=access_key, + secret_key=secret_key, + endpoint_override=address, + scheme='http' + ) + assert options.scheme == 'http' + assert options.endpoint_override == address diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index a2c6352bddaf..fb80020d7ead 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -1843,18 +1843,41 @@ def test_filters_read_table(tempdir): assert table.num_rows == 3 -@pytest.yield_fixture -def s3_example(): - access_key = os.environ['PYARROW_TEST_S3_ACCESS_KEY'] - secret_key = os.environ['PYARROW_TEST_S3_SECRET_KEY'] - bucket_name = os.environ['PYARROW_TEST_S3_BUCKET'] +@pytest.fixture +def s3_bucket(request, minio_server): + boto3 = pytest.importorskip('boto3') + botocore = pytest.importorskip('botocore') + + address, access_key, secret_key = minio_server + s3 = 
boto3.resource( + 's3', + endpoint_url='http://{}'.format(address), + aws_access_key_id=access_key, + aws_secret_access_key=secret_key, + config=botocore.client.Config(signature_version='s3v4'), + region_name='us-east-1' + ) + bucket = s3.Bucket('test-s3fs') + bucket.create() + return 'test-s3fs' + - import s3fs - fs = s3fs.S3FileSystem(key=access_key, secret=secret_key) +@pytest.fixture +def s3_example(minio_server, s3_bucket): + s3fs = pytest.importorskip('s3fs') + + address, access_key, secret_key = minio_server + fs = s3fs.S3FileSystem( + key=access_key, + secret=secret_key, + client_kwargs={ + 'endpoint_url': 'http://{}'.format(address) + } + ) test_dir = guid() + bucket_uri = 's3://{0}/{1}'.format(s3_bucket, test_dir) - bucket_uri = 's3://{0}/{1}'.format(bucket_name, test_dir) fs.mkdir(bucket_uri) yield fs, bucket_uri fs.rm(bucket_uri, recursive=True) @@ -1920,23 +1943,29 @@ def _visit_level(base_dir, level, part_keys): for value in values: this_part_keys = part_keys + [(name, value)] - level_dir = base_dir / '{0}={1}'.format(name, value) + level_dir = fs._path_join( + str(base_dir), + '{0}={1}'.format(name, value) + ) fs.mkdir(level_dir) if level == DEPTH - 1: # Generate example data - file_path = level_dir / guid() - + file_path = fs._path_join(level_dir, guid()) filtered_df = _filter_partition(df, this_part_keys) part_table = pa.Table.from_pandas(filtered_df) with fs.open(file_path, 'wb') as f: _write_table(part_table, f) assert fs.exists(file_path) - (level_dir / '_SUCCESS').touch() + file_success = fs._path_join(level_dir, '_SUCCESS') + with fs.open(file_success, 'wb') as f: + pass else: _visit_level(level_dir, level + 1, this_part_keys) - (level_dir / '_SUCCESS').touch() + file_success = fs._path_join(level_dir, '_SUCCESS') + with fs.open(file_success, 'wb') as f: + pass _visit_level(base_dir, 0, []) diff --git a/python/pyarrow/util.py b/python/pyarrow/util.py index 5e4fb3579372..7219a447f356 100644 --- a/python/pyarrow/util.py +++ 
b/python/pyarrow/util.py @@ -19,8 +19,10 @@ from __future__ import absolute_import +import contextlib import functools import six +import socket import warnings @@ -125,3 +127,11 @@ def get_contiguous_span(shape, strides, itemsize): if end - start != itemsize * product(shape): raise ValueError('array data is non-contiguous') return start, end + + +def find_free_port(): + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + with contextlib.closing(sock) as sock: + sock.bind(('', 0)) + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + return sock.getsockname()[1] diff --git a/python/requirements-test.txt b/python/requirements-test.txt index 73eabfebd288..10d445cbc442 100644 --- a/python/requirements-test.txt +++ b/python/requirements-test.txt @@ -3,4 +3,5 @@ hypothesis pandas pathlib2; python_version < "3.4" pytest +pytest-lazy-fixture pytz diff --git a/python/setup.py b/python/setup.py index 5e88352a8f9b..d7207eedd8f1 100755 --- a/python/setup.py +++ b/python/setup.py @@ -141,6 +141,8 @@ def initialize_options(self): if not hasattr(sys, 'gettotalrefcount'): self.build_type = 'release' + self.with_s3 = strtobool( + os.environ.get('PYARROW_WITH_S3', '0')) self.with_cuda = strtobool( os.environ.get('PYARROW_WITH_CUDA', '0')) self.with_flight = strtobool( @@ -176,6 +178,7 @@ def initialize_options(self): '_parquet', '_orc', '_plasma', + '_s3fs', 'gandiva'] def _run_cmake(self): @@ -215,6 +218,8 @@ def _run_cmake(self): if self.cmake_generator: cmake_options += ['-G', self.cmake_generator] + if self.with_s3: + cmake_options.append('-DPYARROW_BUILD_S3=on') if self.with_cuda: cmake_options.append('-DPYARROW_BUILD_CUDA=on') if self.with_flight: @@ -414,6 +419,8 @@ def _failure_permitted(self, name): return True if name == '_flight' and not self.with_flight: return True + if name == '_s3fs' and not self.with_s3: + return True if name == '_cuda' and not self.with_cuda: return True if name == 'gandiva' and not self.with_gandiva: From 
3b262f628f360b0568ea175adebba96f1187579d Mon Sep 17 00:00:00 2001 From: "Jonathan A. Sternberg" Date: Tue, 1 Oct 2019 17:17:34 +0200 Subject: [PATCH 162/201] ARROW-6648: [Go] Expose the bitutil package Closes #5457 from jsternberg/master and squashes the following commits: 3914a141a ARROW-6648: Expose the bitutil package Authored-by: Jonathan A. Sternberg Signed-off-by: Sebastien Binet --- go/arrow/array/array.go | 2 +- go/arrow/array/array_test.go | 2 +- go/arrow/array/boolean.go | 2 +- go/arrow/array/booleanbuilder.go | 2 +- go/arrow/array/bufferbuilder.go | 2 +- go/arrow/array/bufferbuilder_numeric.gen.go | 2 +- go/arrow/array/builder.go | 2 +- go/arrow/array/decimal128.go | 2 +- go/arrow/array/fixed_size_list.go | 2 +- go/arrow/array/float16_builder.go | 2 +- go/arrow/array/interval.go | 2 +- go/arrow/array/list.go | 2 +- go/arrow/array/numericbuilder.gen.go | 2 +- go/arrow/array/struct.go | 2 +- go/arrow/{internal => }/bitutil/bitutil.go | 0 go/arrow/{internal => }/bitutil/bitutil_test.go | 2 +- go/arrow/ipc/file_reader.go | 2 +- go/arrow/ipc/file_writer.go | 2 +- go/arrow/ipc/writer.go | 2 +- go/arrow/type_traits_boolean.go | 2 +- 20 files changed, 19 insertions(+), 19 deletions(-) rename go/arrow/{internal => }/bitutil/bitutil.go (100%) rename go/arrow/{internal => }/bitutil/bitutil_test.go (99%) diff --git a/go/arrow/array/array.go b/go/arrow/array/array.go index c3577337a7b9..9fc4cc0be6db 100644 --- a/go/arrow/array/array.go +++ b/go/arrow/array/array.go @@ -20,7 +20,7 @@ import ( "sync/atomic" "github.com/apache/arrow/go/arrow" - "github.com/apache/arrow/go/arrow/internal/bitutil" + "github.com/apache/arrow/go/arrow/bitutil" "github.com/apache/arrow/go/arrow/internal/debug" ) diff --git a/go/arrow/array/array_test.go b/go/arrow/array/array_test.go index af50cf955af3..48c238656b4a 100644 --- a/go/arrow/array/array_test.go +++ b/go/arrow/array/array_test.go @@ -298,4 +298,4 @@ func TestArraySliceTypes(t *testing.T) { } }) } -} \ No newline at end of file +} 
diff --git a/go/arrow/array/boolean.go b/go/arrow/array/boolean.go index 5095b17318ab..e352e6e043b3 100644 --- a/go/arrow/array/boolean.go +++ b/go/arrow/array/boolean.go @@ -21,7 +21,7 @@ import ( "strings" "github.com/apache/arrow/go/arrow" - "github.com/apache/arrow/go/arrow/internal/bitutil" + "github.com/apache/arrow/go/arrow/bitutil" "github.com/apache/arrow/go/arrow/memory" ) diff --git a/go/arrow/array/booleanbuilder.go b/go/arrow/array/booleanbuilder.go index 943df1f12007..4a38156849af 100644 --- a/go/arrow/array/booleanbuilder.go +++ b/go/arrow/array/booleanbuilder.go @@ -20,7 +20,7 @@ import ( "sync/atomic" "github.com/apache/arrow/go/arrow" - "github.com/apache/arrow/go/arrow/internal/bitutil" + "github.com/apache/arrow/go/arrow/bitutil" "github.com/apache/arrow/go/arrow/internal/debug" "github.com/apache/arrow/go/arrow/memory" ) diff --git a/go/arrow/array/bufferbuilder.go b/go/arrow/array/bufferbuilder.go index 29d97fc8d41f..bcc7153b5598 100644 --- a/go/arrow/array/bufferbuilder.go +++ b/go/arrow/array/bufferbuilder.go @@ -19,7 +19,7 @@ package array import ( "sync/atomic" - "github.com/apache/arrow/go/arrow/internal/bitutil" + "github.com/apache/arrow/go/arrow/bitutil" "github.com/apache/arrow/go/arrow/internal/debug" "github.com/apache/arrow/go/arrow/memory" ) diff --git a/go/arrow/array/bufferbuilder_numeric.gen.go b/go/arrow/array/bufferbuilder_numeric.gen.go index e39b2528a9ef..4cdf42685f90 100644 --- a/go/arrow/array/bufferbuilder_numeric.gen.go +++ b/go/arrow/array/bufferbuilder_numeric.gen.go @@ -20,7 +20,7 @@ package array import ( "github.com/apache/arrow/go/arrow" - "github.com/apache/arrow/go/arrow/internal/bitutil" + "github.com/apache/arrow/go/arrow/bitutil" "github.com/apache/arrow/go/arrow/memory" ) diff --git a/go/arrow/array/builder.go b/go/arrow/array/builder.go index 6f5beb355d3a..bdd871611afc 100644 --- a/go/arrow/array/builder.go +++ b/go/arrow/array/builder.go @@ -21,7 +21,7 @@ import ( "sync/atomic" 
"github.com/apache/arrow/go/arrow" - "github.com/apache/arrow/go/arrow/internal/bitutil" + "github.com/apache/arrow/go/arrow/bitutil" "github.com/apache/arrow/go/arrow/memory" ) diff --git a/go/arrow/array/decimal128.go b/go/arrow/array/decimal128.go index 390d87e9a496..3acf6b9c444d 100644 --- a/go/arrow/array/decimal128.go +++ b/go/arrow/array/decimal128.go @@ -22,8 +22,8 @@ import ( "sync/atomic" "github.com/apache/arrow/go/arrow" + "github.com/apache/arrow/go/arrow/bitutil" "github.com/apache/arrow/go/arrow/decimal128" - "github.com/apache/arrow/go/arrow/internal/bitutil" "github.com/apache/arrow/go/arrow/internal/debug" "github.com/apache/arrow/go/arrow/memory" ) diff --git a/go/arrow/array/fixed_size_list.go b/go/arrow/array/fixed_size_list.go index 6c61513cf0e1..b0a2ce75b978 100644 --- a/go/arrow/array/fixed_size_list.go +++ b/go/arrow/array/fixed_size_list.go @@ -22,7 +22,7 @@ import ( "sync/atomic" "github.com/apache/arrow/go/arrow" - "github.com/apache/arrow/go/arrow/internal/bitutil" + "github.com/apache/arrow/go/arrow/bitutil" "github.com/apache/arrow/go/arrow/internal/debug" "github.com/apache/arrow/go/arrow/memory" ) diff --git a/go/arrow/array/float16_builder.go b/go/arrow/array/float16_builder.go index 9c36dd172a18..80864279a061 100644 --- a/go/arrow/array/float16_builder.go +++ b/go/arrow/array/float16_builder.go @@ -20,8 +20,8 @@ import ( "sync/atomic" "github.com/apache/arrow/go/arrow" + "github.com/apache/arrow/go/arrow/bitutil" "github.com/apache/arrow/go/arrow/float16" - "github.com/apache/arrow/go/arrow/internal/bitutil" "github.com/apache/arrow/go/arrow/internal/debug" "github.com/apache/arrow/go/arrow/memory" ) diff --git a/go/arrow/array/interval.go b/go/arrow/array/interval.go index 21efd6e4078c..cfdd59379af0 100644 --- a/go/arrow/array/interval.go +++ b/go/arrow/array/interval.go @@ -22,7 +22,7 @@ import ( "sync/atomic" "github.com/apache/arrow/go/arrow" - "github.com/apache/arrow/go/arrow/internal/bitutil" + 
"github.com/apache/arrow/go/arrow/bitutil" "github.com/apache/arrow/go/arrow/internal/debug" "github.com/apache/arrow/go/arrow/memory" "github.com/pkg/errors" diff --git a/go/arrow/array/list.go b/go/arrow/array/list.go index 2fccdeadb945..711e8497f6da 100644 --- a/go/arrow/array/list.go +++ b/go/arrow/array/list.go @@ -22,7 +22,7 @@ import ( "sync/atomic" "github.com/apache/arrow/go/arrow" - "github.com/apache/arrow/go/arrow/internal/bitutil" + "github.com/apache/arrow/go/arrow/bitutil" "github.com/apache/arrow/go/arrow/internal/debug" "github.com/apache/arrow/go/arrow/memory" ) diff --git a/go/arrow/array/numericbuilder.gen.go b/go/arrow/array/numericbuilder.gen.go index b113908af67c..5423d8ce515c 100644 --- a/go/arrow/array/numericbuilder.gen.go +++ b/go/arrow/array/numericbuilder.gen.go @@ -22,7 +22,7 @@ import ( "sync/atomic" "github.com/apache/arrow/go/arrow" - "github.com/apache/arrow/go/arrow/internal/bitutil" + "github.com/apache/arrow/go/arrow/bitutil" "github.com/apache/arrow/go/arrow/internal/debug" "github.com/apache/arrow/go/arrow/memory" ) diff --git a/go/arrow/array/struct.go b/go/arrow/array/struct.go index b70d9539a488..fd2294e0e468 100644 --- a/go/arrow/array/struct.go +++ b/go/arrow/array/struct.go @@ -22,7 +22,7 @@ import ( "sync/atomic" "github.com/apache/arrow/go/arrow" - "github.com/apache/arrow/go/arrow/internal/bitutil" + "github.com/apache/arrow/go/arrow/bitutil" "github.com/apache/arrow/go/arrow/internal/debug" "github.com/apache/arrow/go/arrow/memory" ) diff --git a/go/arrow/internal/bitutil/bitutil.go b/go/arrow/bitutil/bitutil.go similarity index 100% rename from go/arrow/internal/bitutil/bitutil.go rename to go/arrow/bitutil/bitutil.go diff --git a/go/arrow/internal/bitutil/bitutil_test.go b/go/arrow/bitutil/bitutil_test.go similarity index 99% rename from go/arrow/internal/bitutil/bitutil_test.go rename to go/arrow/bitutil/bitutil_test.go index 62134dc64d45..fcb362f9daa2 100644 --- a/go/arrow/internal/bitutil/bitutil_test.go +++ 
b/go/arrow/bitutil/bitutil_test.go @@ -21,7 +21,7 @@ import ( "math/rand" "testing" - "github.com/apache/arrow/go/arrow/internal/bitutil" + "github.com/apache/arrow/go/arrow/bitutil" "github.com/apache/arrow/go/arrow/internal/testing/tools" "github.com/stretchr/testify/assert" ) diff --git a/go/arrow/ipc/file_reader.go b/go/arrow/ipc/file_reader.go index 2b08b01240cb..54213ceced1a 100644 --- a/go/arrow/ipc/file_reader.go +++ b/go/arrow/ipc/file_reader.go @@ -24,7 +24,7 @@ import ( "github.com/apache/arrow/go/arrow" "github.com/apache/arrow/go/arrow/array" - "github.com/apache/arrow/go/arrow/internal/bitutil" + "github.com/apache/arrow/go/arrow/bitutil" "github.com/apache/arrow/go/arrow/internal/flatbuf" "github.com/apache/arrow/go/arrow/memory" "github.com/pkg/errors" diff --git a/go/arrow/ipc/file_writer.go b/go/arrow/ipc/file_writer.go index 1fb6f529302e..8c6dec825c76 100644 --- a/go/arrow/ipc/file_writer.go +++ b/go/arrow/ipc/file_writer.go @@ -22,7 +22,7 @@ import ( "github.com/apache/arrow/go/arrow" "github.com/apache/arrow/go/arrow/array" - "github.com/apache/arrow/go/arrow/internal/bitutil" + "github.com/apache/arrow/go/arrow/bitutil" "github.com/apache/arrow/go/arrow/internal/flatbuf" "github.com/apache/arrow/go/arrow/memory" "github.com/pkg/errors" diff --git a/go/arrow/ipc/writer.go b/go/arrow/ipc/writer.go index 4ed0ff0b1ee1..3262083ce464 100644 --- a/go/arrow/ipc/writer.go +++ b/go/arrow/ipc/writer.go @@ -23,7 +23,7 @@ import ( "github.com/apache/arrow/go/arrow" "github.com/apache/arrow/go/arrow/array" - "github.com/apache/arrow/go/arrow/internal/bitutil" + "github.com/apache/arrow/go/arrow/bitutil" "github.com/apache/arrow/go/arrow/memory" "github.com/pkg/errors" ) diff --git a/go/arrow/type_traits_boolean.go b/go/arrow/type_traits_boolean.go index c30ae0fec4fe..a3a5c59257b1 100644 --- a/go/arrow/type_traits_boolean.go +++ b/go/arrow/type_traits_boolean.go @@ -17,7 +17,7 @@ package arrow import ( - "github.com/apache/arrow/go/arrow/internal/bitutil" + 
"github.com/apache/arrow/go/arrow/bitutil" ) type booleanTraits struct{} From 8231fcb428b2537ebaf635a56d11ae925f77d225 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 1 Oct 2019 11:34:50 -0500 Subject: [PATCH 163/201] ARROW-5831: [Release] Add Python program to download binary artifacts in parallel, allow abort/resume This script only uses the Python standard library and curl. It does 8 downloads in parallel by default. Since Bintray returns sha256 checksums we compute these on any local files and do not re-download files, so that interrupted downloads can be resumed. Closes #5550 from wesm/parallel-rc-binary-verification and squashes the following commits: ff207e670 More robust python3 checking 1d78b9f41 Add Python-based parallel bintray artifact download script that can resume Authored-by: Wes McKinney Signed-off-by: Wes McKinney --- dev/release/download_rc_binaries.py | 161 ++++++++++++++++++++++++ dev/release/verify-release-candidate.sh | 42 +------ 2 files changed, 164 insertions(+), 39 deletions(-) create mode 100644 dev/release/download_rc_binaries.py diff --git a/dev/release/download_rc_binaries.py b/dev/release/download_rc_binaries.py new file mode 100644 index 000000000000..01ab2c468eaf --- /dev/null +++ b/dev/release/download_rc_binaries.py @@ -0,0 +1,161 @@ +#!/usr/bin/env python + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import sys + +try: + import argparse + import concurrent.futures as cf + import functools + import hashlib + import json + import os + import subprocess + import urllib.request +except ImportError: + if sys.version_info.major < 3: + raise Exception("Please use Python 3 to run this script") + raise + + +BINTRAY_API_ROOT = "https://bintray.com/api/v1" +BINTRAY_DL_ROOT = "https://dl.bintray.com" +BINTRAY_REPO = 'apache/arrow' +DEFAULT_PARALLEL_DOWNLOADS = 8 + + +class Bintray: + + def __init__(self, repo=BINTRAY_REPO): + self.repo = repo + + def get_file_list(self, package, version): + url = os.path.join(BINTRAY_API_ROOT, 'packages', self.repo, package, + 'versions', version, 'files') + request = urllib.request.urlopen(url).read() + return json.loads(request) + + def download_files(self, files, dest=None, num_parallel=None): + """ + Download files from Bintray in parallel. If file already exists, will + overwrite if the checksum does not match what Bintray says it should be + + Parameters + ---------- + files : List[Dict] + File listing from Bintray + dest : str, default None + Defaults to current working directory + num_parallel : int, default 8 + Number of files to download in parallel. 
If set to None, uses + default + """ + if dest is None: + dest = os.getcwd() + if num_parallel is None: + num_parallel = DEFAULT_PARALLEL_DOWNLOADS + + if num_parallel == 1: + for path in files: + self._download_file(dest, path) + else: + parallel_map_terminate_early( + functools.partial(self._download_file, dest), + files, + num_parallel + ) + + def _download_file(self, dest, info): + relpath = info['path'] + + base, filename = os.path.split(relpath) + + dest_dir = os.path.join(dest, base) + os.makedirs(dest_dir, exist_ok=True) + + dest_path = os.path.join(dest_dir, filename) + + if os.path.exists(dest_path): + with open(dest_path, 'rb') as f: + sha256sum = hashlib.sha256(f.read()).hexdigest() + if sha256sum == info['sha256']: + print('Local file {} sha256 matches, skipping' + .format(dest_path)) + return + else: + print('Local file sha256 does not match, overwriting') + + print("Downloading {} to {}".format(relpath, dest_path)) + + bintray_abspath = os.path.join(BINTRAY_DL_ROOT, self.repo, relpath) + + cmd = [ + 'curl', '--fail', '--location', + '--output', dest_path, bintray_abspath + ] + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + stdout, stderr = proc.communicate() + if proc.returncode != 0: + raise Exception("Downloading {} failed\nstdout: {}\nstderr: {}" + .format(relpath, stdout, stderr)) + + +def parallel_map_terminate_early(f, iterable, num_parallel): + tasks = [] + with cf.ProcessPoolExecutor(num_parallel) as pool: + for v in iterable: + tasks.append(pool.submit(functools.partial(f, v))) + + for task in cf.as_completed(tasks): + if task.exception() is not None: + e = task.exception() + for task in tasks: + task.cancel() + raise e + + +ARROW_PACKAGE_TYPES = ['centos', 'debian', 'python', 'ubuntu'] + + +def download_rc_binaries(version, rc_number, dest=None, num_parallel=None): + bintray = Bintray() + + version_string = '{}-rc{}'.format(version, rc_number) + for package_type in ARROW_PACKAGE_TYPES: + files = 
bintray.get_file_list('{}-rc'.format(package_type), + version_string) + bintray.download_files(files, dest=dest, num_parallel=num_parallel) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='Download release candidate binaries' + ) + parser.add_argument('version', type=str, help='The version number') + parser.add_argument('rc_number', type=int, + help='The release candidate number, e.g. 0, 1, etc') + parser.add_argument('--dest', type=str, default=os.getcwd(), + help='The output folder for the downloaded files') + parser.add_argument('--num_parallel', type=int, default=8, + help='The number of concurrent downloads to do') + args = parser.parse_args() + + download_rc_binaries(args.version, args.rc_number, dest=args.dest, + num_parallel=args.num_parallel) diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index b1a5997de348..d09d13659eaf 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -99,49 +99,13 @@ fetch_archive() { shasum -a 512 -c ${dist_name}.tar.gz.sha512 } -bintray() { - local command=$1 - shift - local path=$1 - shift - local url=https://bintray.com/api/v1${path} - echo "${command} ${url}" 1>&2 - curl \ - --fail \ - --request ${command} \ - ${url} \ - "$@" | \ - jq . 
-} - -download_bintray_files() { - local target=$1 - - local version_name=${VERSION}-rc${RC_NUMBER} - - local file - bintray \ - GET /packages/${BINTRAY_REPOSITORY}/${target}-rc/versions/${version_name}/files | \ - jq -r ".[].path" | \ - while read file; do - mkdir -p "$(dirname ${file})" - curl \ - --fail \ - --location \ - --output ${file} \ - https://dl.bintray.com/${BINTRAY_REPOSITORY}/${file} - done -} - test_binary() { local download_dir=binaries mkdir -p ${download_dir} - pushd ${download_dir} - # takes longer on slow network - for target in centos debian python ubuntu; do - download_bintray_files ${target} - done + python3 $SOURCE_DIR/download_rc_binaries.py $VERSION $RC_NUMBER --dest=${download_dir} + + pushd ${download_dir} # verify the signature and the checksums of each artifact find . -name '*.asc' | while read sigfile; do From 871aedbad26e05911b844d92197f975533c599bf Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 1 Oct 2019 23:59:57 +0200 Subject: [PATCH 164/201] ARROW-6751: [CI] Fix ccache setup on Travis-CI Closes #5554 from pitrou/ARROW-6751-travis-ccache and squashes the following commits: cbc10f7ea Again 08857c57e Again 4ea772d5e ARROW-6751: Fix ccache setup on Travis-CI Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- .travis.yml | 43 ++++++++++++++++++++++++++++--------- ci/docker_build_cpp.sh | 4 ++++ ci/travis/before-install.sh | 5 +++++ 3 files changed, 42 insertions(+), 10 deletions(-) diff --git a/.travis.yml b/.travis.yml index 990fd2256674..534004d0d160 100644 --- a/.travis.yml +++ b/.travis.yml @@ -26,10 +26,6 @@ cache: - $HOME/.ccache - $HOME/.m2 # Maven -env: - - CCACHE_COMPRESS: 1 - - CCACHE_COMPRESSLEVEL: 5 - git: # ARROW-3083: We use a higher clone depth to also have the latest tag in the cloned history. 
depth: 250 @@ -40,6 +36,8 @@ before_install: matrix: fast_finish: true include: + + # ------------------------------------------------------------------------- - name: "Lint, Release tests" os: linux language: python @@ -60,6 +58,8 @@ matrix: - docker-compose build --pull cpp-system-deps script: - docker-compose run cpp-system-deps + + # ------------------------------------------------------------------------- # Separating Valgrind and C++ coverage makes individual jobs shorter - name: "C++ unit tests w/ conda-forge toolchain, coverage" compiler: gcc @@ -95,6 +95,8 @@ matrix: - $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh || travis_terminate 1 - $TRAVIS_BUILD_DIR/ci/travis_script_gandiva_java.sh || travis_terminate 1 - $TRAVIS_BUILD_DIR/ci/travis_upload_cpp_coverage.sh || travis_terminate 1 + + # ------------------------------------------------------------------------- - name: "Python 3.6 unit tests w/ Valgrind, conda-forge toolchain, coverage" compiler: gcc language: cpp @@ -128,11 +130,12 @@ matrix: - export PLASMA_VALGRIND=1 - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 3.6 - $TRAVIS_BUILD_DIR/ci/travis_upload_cpp_coverage.sh + + # ------------------------------------------------------------------------- - name: "[OS X] C++ w/ XCode 9.3" compiler: clang osx_image: xcode9.3 os: osx - cache: addons: env: - ARROW_TRAVIS_USE_TOOLCHAIN=1 @@ -159,11 +162,12 @@ matrix: - $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh || travis_terminate 1 # Disabled because of ARROW-4763 # - $TRAVIS_BUILD_DIR/ci/travis_script_gandiva_java.sh + + # ------------------------------------------------------------------------- - name: "[OS X] Python w/ XCode 9.3" compiler: clang osx_image: xcode9.3 os: osx - cache: addons: env: - ARROW_TRAVIS_S3=1 @@ -178,6 +182,8 @@ matrix: - if [ $ARROW_CI_PYTHON_AFFECTED != "1" ]; then exit; fi - $TRAVIS_BUILD_DIR/ci/travis_install_minio.sh - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 3.6 + + # 
------------------------------------------------------------------------- - name: "Java OpenJDK8 and OpenJDK11" language: cpp os: linux @@ -186,6 +192,8 @@ matrix: - docker-compose build --pull java-all-jdks script: - docker-compose run java-all-jdks + + # ------------------------------------------------------------------------- - name: "Integration w/ OpenJDK 8, conda-forge toolchain" language: java os: linux @@ -213,6 +221,8 @@ matrix: script: - $TRAVIS_BUILD_DIR/ci/travis_script_integration.sh # - $TRAVIS_BUILD_DIR/ci/travis_script_plasma_java_client.sh + + # ------------------------------------------------------------------------- - name: "NodeJS" language: node_js os: linux @@ -224,6 +234,8 @@ matrix: - $TRAVIS_BUILD_DIR/ci/travis_before_script_js.sh script: - $TRAVIS_BUILD_DIR/ci/travis_script_js.sh + + # ------------------------------------------------------------------------- - name: "C++ & GLib & Ruby w/ gcc 5.4" compiler: gcc os: linux @@ -243,6 +255,8 @@ matrix: script: - if [ $ARROW_CI_C_GLIB_AFFECTED = "1" ]; then $TRAVIS_BUILD_DIR/ci/travis_script_c_glib.sh; fi - $TRAVIS_BUILD_DIR/ci/travis_script_ruby.sh + + # ------------------------------------------------------------------------- - name: "[OS X] C++ & GLib & Ruby w/ XCode 10.2 & Homebrew" compiler: clang osx_image: xcode10.2 @@ -256,7 +270,6 @@ matrix: - ARROW_TRAVIS_PYTHON=1 - ARROW_TRAVIS_VERBOSE=1 - ARROW_TRAVIS_MIMALLOC=1 - cache: addons: before_script: - if [ $ARROW_CI_RUBY_AFFECTED != "1" ]; then exit; fi @@ -267,6 +280,8 @@ matrix: script: - if [ $ARROW_CI_C_GLIB_AFFECTED = "1" ]; then $TRAVIS_BUILD_DIR/ci/travis_script_c_glib.sh; fi - $TRAVIS_BUILD_DIR/ci/travis_script_ruby.sh + + # ------------------------------------------------------------------------- - name: Rust language: rust cache: cargo @@ -293,6 +308,8 @@ matrix: - export PARQUET_TEST_DATA=$TRAVIS_BUILD_DIR/cpp/submodules/parquet-testing/data - cargo tarpaulin --out Xml - bash <(curl -s https://codecov.io/bash) || echo "Codecov 
did not collect coverage reports" + + # ------------------------------------------------------------------------- - name: Go language: go go_import_path: github.com/apache/arrow @@ -306,9 +323,14 @@ matrix: after_success: - pushd ${TRAVIS_BUILD_DIR}/go/arrow - bash <(curl -s https://codecov.io/bash) || echo "Codecov did not collect coverage reports" + + # ------------------------------------------------------------------------- - name: R (with and without libarrow) language: r - cache: packages + cache: + directories: + - $HOME/.ccache + packages: true dist: xenial env: - ARROW_TRAVIS_PARQUET=1 @@ -318,6 +340,7 @@ matrix: # Have to copy-paste this here because of how R's build steps work - eval `python $TRAVIS_BUILD_DIR/ci/detect-changes.py` - if [ $ARROW_CI_R_AFFECTED != "1" ]; then exit; fi + - source ci/travis/before-install.sh # First check that it builds without libarrow - pushd ${TRAVIS_BUILD_DIR}/r - R -e 'install.packages("remotes"); remotes::install_deps(dep = TRUE)' @@ -348,10 +371,10 @@ matrix: - Rscript ../ci/travis_upload_r_coverage.R after_failure: - - source ci/travis/after-failure.sh + - source ${TRAVIS_BUILD_DIR}/ci/travis/after-failure.sh after_script: - - source ci/travis/after-script.sh + - source ${TRAVIS_BUILD_DIR}/ci/travis/after-script.sh env: global: diff --git a/ci/docker_build_cpp.sh b/ci/docker_build_cpp.sh index 29860bf7199a..8eb96a21bfd5 100755 --- a/ci/docker_build_cpp.sh +++ b/ci/docker_build_cpp.sh @@ -22,7 +22,11 @@ source_dir=${1:-/arrow/cpp} build_dir=${2:-/build/cpp} install_dir=${3:-${ARROW_HOME:-/usr/local}} +export CCACHE_COMPRESS=1 +export CCACHE_COMPRESSLEVEL=5 +export CCACHE_COMPILERCHECK=content export CCACHE_DIR=/build/ccache +export PATH=/usr/lib/ccache/:$PATH rm -rf ${build_dir} mkdir -p ${build_dir} diff --git a/ci/travis/before-install.sh b/ci/travis/before-install.sh index fd97b469341c..1eb45fd436b5 100755 --- a/ci/travis/before-install.sh +++ b/ci/travis/before-install.sh @@ -30,6 +30,11 @@ if [[ 
"${TRAVIS_OS_NAME}" == "linux" ]]; then echo -e 'Acquire::Retries 10; Acquire::http::Timeout \"20\";' | \ sudo tee /etc/apt/apt.conf.d/99-travis-retry sudo apt-get update -qq + + export CCACHE_COMPRESS=1 + export CCACHE_COMPRESSLEVEL=5 + export CCACHE_COMPILERCHECK=content + export PATH=/usr/lib/ccache/:$PATH ccache --show-stats fi From 9694200d2760f320cb662c069360f810eac86416 Mon Sep 17 00:00:00 2001 From: Bruce Mitchener Date: Tue, 1 Oct 2019 19:38:21 -0400 Subject: [PATCH 165/201] ARROW-6745: [Rust] Fix a variety of minor typos. Closes #5548 from waywardmonkeys/fix-typos and squashes the following commits: 0ee704fbb ARROW-6745: Fix a variety of minor typos. Authored-by: Bruce Mitchener Signed-off-by: Paddy Horan --- rust/README.md | 2 +- rust/arrow/README.md | 2 +- rust/arrow/src/compute/kernels/take.rs | 6 +++--- rust/arrow/src/csv/reader.rs | 2 +- rust/arrow/src/datatypes.rs | 2 +- rust/datafusion/README.md | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/rust/README.md b/rust/README.md index 377cf6c6fef7..7b712511c14b 100644 --- a/rust/README.md +++ b/rust/README.md @@ -29,7 +29,7 @@ ## Prerequisites -Before running tests and examples it is necessary to set up the local development enviroment. +Before running tests and examples it is necessary to set up the local development environment. ### Git Submodules diff --git a/rust/arrow/README.md b/rust/arrow/README.md index 4bc6b062648e..7dffc3432024 100644 --- a/rust/arrow/README.md +++ b/rust/arrow/README.md @@ -99,7 +99,7 @@ cargo publish If the Cargo.toml does not have the correct version then it will be necessary to modify it manually. Since there is now a modified file locally that is not -committed to github it will be necessary to use the following command. +committed to GitHub it will be necessary to use the following command. 
```bash cargo publish --allow-dirty diff --git a/rust/arrow/src/compute/kernels/take.rs b/rust/arrow/src/compute/kernels/take.rs index 6cce7fb47d95..b591982c56e5 100644 --- a/rust/arrow/src/compute/kernels/take.rs +++ b/rust/arrow/src/compute/kernels/take.rs @@ -359,7 +359,7 @@ mod tests { let a = take(&list_array, &index, None).unwrap(); let a: &ListArray = a.as_any().downcast_ref::().unwrap(); - // construct a value aray with expected results: + // construct a value array with expected results: // [[2,3], null, [-1,-2,-1], [2,3], [0,0,0]] let expected_data = Int32Array::from(vec![ Some(2), @@ -424,7 +424,7 @@ mod tests { let a = take(&list_array, &index, None).unwrap(); let a: &ListArray = a.as_any().downcast_ref::().unwrap(); - // construct a value aray with expected results: + // construct a value array with expected results: // [[null], null, [-1,-2,3], [5,null], [0,null,0]] let expected_data = Int32Array::from(vec![ None, @@ -487,7 +487,7 @@ mod tests { let a = take(&list_array, &index, None).unwrap(); let a: &ListArray = a.as_any().downcast_ref::().unwrap(); - // construct a value aray with expected results: + // construct a value array with expected results: // [null, null, [-1,-2,3], [5,null], [0,null,0]] let expected_data = Int32Array::from(vec![ Some(-1), diff --git a/rust/arrow/src/csv/reader.rs b/rust/arrow/src/csv/reader.rs index a031e199beff..15e0d33ae6d3 100644 --- a/rust/arrow/src/csv/reader.rs +++ b/rust/arrow/src/csv/reader.rs @@ -405,7 +405,7 @@ pub struct ReaderBuilder { /// If schema inference is run on a file with no headers, default column names /// are created. has_headers: bool, - /// An optional column delimiter. Defauits to `b','` + /// An optional column delimiter. 
Defaults to `b','` delimiter: Option, /// Optional maximum number of records to read during schema inference /// diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index 5698ccbc1bd2..40e0153e81d8 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -785,7 +785,7 @@ impl Field { }) } - /// Converts to a `String` representation of the the `Field` + /// Converts to a `String` representation of the `Field` pub fn to_string(&self) -> String { format!("{}: {:?}", self.name, self.data_type) } diff --git a/rust/datafusion/README.md b/rust/datafusion/README.md index 7058a3b1b433..f3dc512bb032 100644 --- a/rust/datafusion/README.md +++ b/rust/datafusion/README.md @@ -35,7 +35,7 @@ datafusion = "0.15.0-SNAPSHOT" #### Use as a bin ##### Build your own bin(requires rust toolchains) ```sh -git clone https://github/apache/arrow +git clone https://github.com/apache/arrow cd arrow/rust/datafusion cargo run --bin datafusion-cli ``` From d75d18674c9cf6066e30c0f82ba5c37d186d5184 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Wed, 2 Oct 2019 15:36:16 +0900 Subject: [PATCH 166/201] ARROW-6730: [CI] Use GitHub Actions for "C++ with clang 7" docker image MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #5530 from fsaintjacques/github-action-docker and squashes the following commits: 4bb16ce1a Add GitHub Actions 648f2f093 Use expression c849d7719 Run all tests for push 473c1c2b7 GitHub 99a874846 Try github action Lead-authored-by: François Saint-Jacques Co-authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- .github/workflows/linux-docker-compose.yml | 47 ++++++++++++++++++ ci/detect-changes.py | 55 +++++++++++++++++++++- cpp/Dockerfile.ubuntu-bionic | 10 ++-- docker-compose.yml | 1 - 4 files changed, 106 insertions(+), 7 deletions(-) create mode 100644 .github/workflows/linux-docker-compose.yml diff --git a/.github/workflows/linux-docker-compose.yml 
b/.github/workflows/linux-docker-compose.yml new file mode 100644 index 000000000000..d1198572b554 --- /dev/null +++ b/.github/workflows/linux-docker-compose.yml @@ -0,0 +1,47 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +name: Linux docker-compose +on: + - push + - pull_request +jobs: + build: + name: Test + strategy: + matrix: + label: + - C++ w/ clang-7 & system packages + include: + - label: C++ w/ clang-7 & system packages + image: cpp-system-deps + skip_expression: | + ${ARROW_CI_CPP_AFFECTED} != "1" + runs-on: ubuntu-18.04 + steps: + - uses: actions/checkout@master + with: + submodules: true + - name: docker-compose + run: | + eval "$(python ci/detect-changes.py)" + if [[ ${{ matrix.skip_expression }} ]]; then + exit + fi + + docker-compose build --pull ${{ matrix.image }} + docker-compose run ${{ matrix.image }} diff --git a/ci/detect-changes.py b/ci/detect-changes.py index d35d9b570b9a..424cd99d8663 100644 --- a/ci/detect-changes.py +++ b/ci/detect-changes.py @@ -129,6 +129,16 @@ def list_appveyor_affected_files(): return list_affected_files("{0}..HEAD".format(merge_base)) +def list_github_actions_affected_files(): + """ + Return a list of files affected in the current GitHub Actions build. 
+ """ + # GitHub Actions checkout `refs/remotes/pull/$PR/merge` where `HEAD` points + # to the merge commit while `HEAD^` points to the commit before. Hence, + # `..HEAD^` points to all commit between master and the PR. + return list_affected_files("HEAD^..") + + LANGUAGE_TOPICS = ['c_glib', 'cpp', 'docs', 'go', 'java', 'js', 'python', 'r', 'ruby', 'rust', 'csharp'] @@ -143,6 +153,9 @@ def list_appveyor_affected_files(): 'format': LANGUAGE_TOPICS, 'go': ['integration'], '.travis.yml': ALL_TOPICS, + # In theory, it should ignore CONTRIBUTING.md and ISSUE_TEMPLATE.md, but in + # practice it's going to be CI + '.github': ALL_TOPICS, 'c_glib': ['ruby'] } @@ -255,6 +268,23 @@ def run_from_appveyor(): return get_windows_shell_eval(make_env_for_topics(affected)) +def run_from_github(): + perr("Environment variables (excerpt):") + dump_env_vars('GITHUB_', '(REPOSITORY|ACTOR|SHA|REF|HEAD_REF|BASE_REF|EVENT_NAME)') + if os.environ['GITHUB_EVENT_NAME'] != 'pull_request': + # Not a PR build, test everything + affected = dict.fromkeys(ALL_TOPICS, True) + else: + affected_files = list_github_actions_affected_files() + perr("Affected files:", affected_files) + affected = get_affected_topics(affected_files) + assert set(affected) <= set(ALL_TOPICS), affected + + perr("Affected topics:") + perr(pprint.pformat(affected)) + return get_unix_shell_eval(make_env_for_topics(affected)) + + def test_get_affected_topics(): affected_topics = get_affected_topics(['cpp/CMakeLists.txt']) assert affected_topics == { @@ -290,6 +320,23 @@ def test_get_affected_topics(): 'dev': False } + affected_topics = get_affected_topics(['.github/workflows']) + assert affected_topics == { + 'c_glib': True, + 'cpp': True, + 'docs': True, + 'go': True, + 'java': True, + 'js': True, + 'python': True, + 'r': True, + 'ruby': True, + 'rust': True, + 'csharp': True, + 'integration': True, + 'dev': True, + } + if __name__ == "__main__": # This script should have its output evaluated by a shell, @@ -307,5 +354,11 @@ 
def test_get_affected_topics(): except Exception: print("exit 1") raise + elif os.environ.get('GITHUB_WORKFLOW'): + try: + print(run_from_github()) + except Exception: + print("exit 1") + raise else: - sys.exit("Script must be run under Travis-CI or AppVeyor") + sys.exit("Script must be run under Travis-CI, AppVeyor or GitHub Actions") diff --git a/cpp/Dockerfile.ubuntu-bionic b/cpp/Dockerfile.ubuntu-bionic index 763e13e50c87..325ebb7ee76b 100644 --- a/cpp/Dockerfile.ubuntu-bionic +++ b/cpp/Dockerfile.ubuntu-bionic @@ -71,12 +71,15 @@ RUN apt-get update -y -q && \ libgoogle-glog-dev \ liblz4-dev \ liblzma-dev \ + libprotobuf-dev \ + libprotoc-dev \ libre2-dev \ libsnappy-dev \ libssl-dev \ libzstd-dev \ ninja-build \ pkg-config \ + protobuf-compiler \ rapidjson-dev \ thrift-compiler \ tzdata && \ @@ -92,14 +95,11 @@ RUN apt-get update -y -q && \ ENV CMAKE_ARGS="-DThrift_SOURCE=BUNDLED \ -DFlatbuffers_SOURCE=BUNDLED \ -DGTest_SOURCE=BUNDLED \ --DORC_SOURCE=BUNDLED \ --Dc-ares_SOURCE=BUNDLED \ --DgRPC_SOURCE=BUNDLED \ --DProtobuf_SOURCE=BUNDLED ${CMAKE_ARGS}" +-DORC_SOURCE=BUNDLED" # Prioritize system packages and local installation ENV ARROW_DEPENDENCY_SOURCE=SYSTEM \ - ARROW_FLIGHT=ON \ + ARROW_FLIGHT=OFF \ ARROW_GANDIVA=ON \ ARROW_HDFS=ON \ ARROW_ORC=ON \ diff --git a/docker-compose.yml b/docker-compose.yml index a5cb6061f617..0d0f6f9fc6bc 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -131,7 +131,6 @@ services: args: LLVM_VERSION: 7 environment: - ARROW_FLIGHT: "ON" ARROW_USE_ASAN: "ON" ARROW_USE_UBSAN: "ON" volumes: From e72a0da1498fad739c630b51449d76e82887a54e Mon Sep 17 00:00:00 2001 From: Sebastien Binet Date: Wed, 2 Oct 2019 10:32:42 +0200 Subject: [PATCH 167/201] =?UTF-8?q?ARROW-6752:=20[Go]=20make=20Null=20arra?= =?UTF-8?q?y=20implement=20Stringer,=20add=20tests=20for=20Nu=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …ll arrays Closes #5555 from sbinet/issue-6752 and squashes the 
following commits: 18b5387cc ARROW-6752: make Null array implement Stringer, add tests for Null arrays Authored-by: Sebastien Binet Signed-off-by: Sebastien Binet --- go/arrow/array/compare_test.go | 14 ++++---- go/arrow/array/null.go | 14 ++++++++ go/arrow/internal/arrdata/arrdata.go | 52 ++++++++++++++++++++++++++++ go/arrow/internal/arrjson/arrjson.go | 13 +++++++ 4 files changed, 86 insertions(+), 7 deletions(-) diff --git a/go/arrow/array/compare_test.go b/go/arrow/array/compare_test.go index e9927f09f308..3ed326be1758 100644 --- a/go/arrow/array/compare_test.go +++ b/go/arrow/array/compare_test.go @@ -44,7 +44,7 @@ func TestArrayEqual(t *testing.T) { sub2 := array.NewSlice(arr, 0, int64(arr.Len()-1)) defer sub2.Release() - if array.ArrayEqual(sub1, sub2) { + if array.ArrayEqual(sub1, sub2) && name != "nulls" { t.Fatalf("non-identical arrays should not compare equal:\nsub1=%v\nsub2=%v\narrf=%v\n", sub1, sub2, arr) } }) @@ -73,7 +73,7 @@ func TestArraySliceEqual(t *testing.T) { sub2 := array.NewSlice(arr, 0, int64(arr.Len()-1)) defer sub2.Release() - if array.ArraySliceEqual(sub1, 0, int64(sub1.Len()), sub2, 0, int64(sub2.Len())) { + if array.ArraySliceEqual(sub1, 0, int64(sub1.Len()), sub2, 0, int64(sub2.Len())) && name != "nulls" { t.Fatalf("non-identical slices should not compare equal:\nsub1=%v\nsub2=%v\narrf=%v\n", sub1, sub2, arr) } }) @@ -99,7 +99,7 @@ func TestArrayApproxEqual(t *testing.T) { sub2 := array.NewSlice(arr, 0, int64(arr.Len()-1)) defer sub2.Release() - if array.ArrayApproxEqual(sub1, sub2) { + if array.ArrayApproxEqual(sub1, sub2) && name != "nulls" { t.Fatalf("non-identical arrays should not compare equal:\nsub1=%v\nsub2=%v\narrf=%v\n", sub1, sub2, arr) } }) @@ -489,7 +489,7 @@ func TestRecordEqual(t *testing.T) { t.Fatalf("identical records should compare equal:\nrecord:\n%v", rec0) } - if array.RecordEqual(rec0, rec1) { + if array.RecordEqual(rec0, rec1) && name != "nulls" { t.Fatalf("non-identical records should not compare 
equal:\nrec0:\n%v\nrec1:\n%v", rec0, rec1) } @@ -498,7 +498,7 @@ func TestRecordEqual(t *testing.T) { sub01 := rec0.NewSlice(1, recs[0].NumRows()) defer sub01.Release() - if array.RecordEqual(sub00, sub01) { + if array.RecordEqual(sub00, sub01) && name != "nulls" { t.Fatalf("non-identical records should not compare equal:\nsub0:\n%v\nsub1:\n%v", sub00, sub01) } }) @@ -514,7 +514,7 @@ func TestRecordApproxEqual(t *testing.T) { t.Fatalf("identical records should compare equal:\nrecord:\n%v", rec0) } - if array.RecordApproxEqual(rec0, rec1) { + if array.RecordApproxEqual(rec0, rec1) && name != "nulls" { t.Fatalf("non-identical records should not compare equal:\nrec0:\n%v\nrec1:\n%v", rec0, rec1) } @@ -523,7 +523,7 @@ func TestRecordApproxEqual(t *testing.T) { sub01 := rec0.NewSlice(1, recs[0].NumRows()) defer sub01.Release() - if array.RecordApproxEqual(sub00, sub01) { + if array.RecordApproxEqual(sub00, sub01) && name != "nulls" { t.Fatalf("non-identical records should not compare equal:\nsub0:\n%v\nsub1:\n%v", sub00, sub01) } }) diff --git a/go/arrow/array/null.go b/go/arrow/array/null.go index 346445d5bec9..56d7fa45fa18 100644 --- a/go/arrow/array/null.go +++ b/go/arrow/array/null.go @@ -17,6 +17,7 @@ package array import ( + "strings" "sync/atomic" "github.com/apache/arrow/go/arrow" @@ -53,6 +54,19 @@ func NewNullData(data *Data) *Null { return a } +func (a *Null) String() string { + o := new(strings.Builder) + o.WriteString("[") + for i := 0; i < a.Len(); i++ { + if i > 0 { + o.WriteString(" ") + } + o.WriteString("(null)") + } + o.WriteString("]") + return o.String() +} + func (a *Null) setData(data *Data) { a.array.setData(data) a.array.nullBitmapBytes = nil diff --git a/go/arrow/internal/arrdata/arrdata.go b/go/arrow/internal/arrdata/arrdata.go index c24923631fa3..40dee3b54fd6 100644 --- a/go/arrow/internal/arrdata/arrdata.go +++ b/go/arrow/internal/arrdata/arrdata.go @@ -34,6 +34,7 @@ var ( ) func init() { + Records["nulls"] = makeNullRecords() 
Records["primitives"] = makePrimitiveRecords() Records["structs"] = makeStructsRecords() Records["lists"] = makeListsRecords() @@ -51,6 +52,49 @@ func init() { sort.Strings(RecordNames) } +func makeNullRecords() []array.Record { + mem := memory.NewGoAllocator() + + meta := arrow.NewMetadata( + []string{"k1", "k2", "k3"}, + []string{"v1", "v2", "v3"}, + ) + + schema := arrow.NewSchema( + []arrow.Field{ + arrow.Field{Name: "nulls", Type: arrow.Null, Nullable: true}, + }, &meta, + ) + + mask := []bool{true, false, false, true, true} + chunks := [][]array.Interface{ + []array.Interface{ + arrayOf(mem, []nullT{null, null, null, null, null}, mask), + }, + []array.Interface{ + arrayOf(mem, []nullT{null, null, null, null, null}, mask), + }, + []array.Interface{ + arrayOf(mem, []nullT{null, null, null, null, null}, mask), + }, + } + + defer func() { + for _, chunk := range chunks { + for _, col := range chunk { + col.Release() + } + } + }() + + recs := make([]array.Record, len(chunks)) + for i, chunk := range chunks { + recs[i] = array.NewRecord(schema, chunk, -1) + } + + return recs +} + func makePrimitiveRecords() []array.Record { mem := memory.NewGoAllocator() @@ -362,6 +406,7 @@ func makeStringsRecords() []array.Record { } type ( + nullT struct{} time32s arrow.Time32 time32ms arrow.Time32 time64ns arrow.Time64 @@ -372,6 +417,10 @@ type ( timestamp_ns arrow.Timestamp ) +var ( + null nullT +) + func makeFixedWidthTypesRecords() []array.Record { mem := memory.NewGoAllocator() schema := arrow.NewSchema( @@ -650,6 +699,9 @@ func arrayOf(mem memory.Allocator, a interface{}, valids []bool) array.Interface } switch a := a.(type) { + case []nullT: + return array.NewNull(len(a)) + case []bool: bldr := array.NewBooleanBuilder(mem) defer bldr.Release() diff --git a/go/arrow/internal/arrjson/arrjson.go b/go/arrow/internal/arrjson/arrjson.go index 11cc3d0f2a1f..2b2bcaf92599 100644 --- a/go/arrow/internal/arrjson/arrjson.go +++ b/go/arrow/internal/arrjson/arrjson.go @@ -65,6 +65,8 @@ 
type dataType struct { func dtypeToJSON(dt arrow.DataType) dataType { switch dt := dt.(type) { + case *arrow.NullType: + return dataType{Name: "null"} case *arrow.BooleanType: return dataType{Name: "bool"} case *arrow.Int8Type: @@ -155,6 +157,8 @@ func dtypeToJSON(dt arrow.DataType) dataType { func dtypeFromJSON(dt dataType, children []Field) arrow.DataType { switch dt.Name { + case "null": + return arrow.Null case "bool": return arrow.FixedWidthTypes.Boolean case "int": @@ -364,6 +368,9 @@ func arraysToJSON(schema *arrow.Schema, arrs []array.Interface) []Array { func arrayFromJSON(mem memory.Allocator, dt arrow.DataType, arr Array) array.Interface { switch dt := dt.(type) { + case *arrow.NullType: + return array.NewNull(arr.Count) + case *arrow.BooleanType: bldr := array.NewBooleanBuilder(mem) defer bldr.Release() @@ -618,6 +625,12 @@ func arrayFromJSON(mem memory.Allocator, dt arrow.DataType, arr Array) array.Int func arrayToJSON(field arrow.Field, arr array.Interface) Array { switch arr := arr.(type) { + case *array.Null: + return Array{ + Name: field.Name, + Count: arr.Len(), + } + case *array.Boolean: return Array{ Name: field.Name, From 7d18c1cc307558423da144f31e65983b846e0f10 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 2 Oct 2019 13:37:29 +0200 Subject: [PATCH 168/201] ARROW-6750: [Python] Silence S3 error logs by default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #5553 from pitrou/ARROW-6750-python-silence-s3-error-logs and squashes the following commits: 33edb9dd2 ARROW-6750: Silence S3 error logs by default Authored-by: Antoine Pitrou Signed-off-by: Krisztián Szűcs --- python/pyarrow/_s3fs.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/_s3fs.pyx b/python/pyarrow/_s3fs.pyx index d1f820e51958..96b78f8b0a73 100644 --- a/python/pyarrow/_s3fs.pyx +++ b/python/pyarrow/_s3fs.pyx @@ -37,7 +37,7 @@ cpdef enum S3LogLevel: Trace = CS3LogLevel_Trace -def 
initialize_s3(S3LogLevel log_level=S3LogLevel.Error): +def initialize_s3(S3LogLevel log_level=S3LogLevel.Fatal): cdef CS3GlobalOptions options options.log_level = log_level check_status(CInitializeS3(options)) From 5f93f85ebd941f8b11e4633fa83b04447b31e2af Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 2 Oct 2019 16:15:37 +0200 Subject: [PATCH 169/201] ARROW-6755: [Release] Improve Windows release verification script * Improve performance by only building shared libraries * Use `--no-shortcuts` conda option to avoid annoying popups * Use ninja-build for better performance Closes #5559 from wesm/windows-rc-script-improvements and squashes the following commits: b0d3092b5 Do not build static libraries, avoid annoying UAC popup Authored-by: Wes McKinney Signed-off-by: Antoine Pitrou --- dev/release/verify-release-candidate.bat | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/dev/release/verify-release-candidate.bat b/dev/release/verify-release-candidate.bat index 299297e95c54..cfbf56502ab8 100644 --- a/dev/release/verify-release-candidate.bat +++ b/dev/release/verify-release-candidate.bat @@ -41,11 +41,14 @@ set PYTHON=3.6 @rem Using call with conda.bat seems necessary to avoid terminating the batch @rem script execution -call conda create -p %_VERIFICATION_CONDA_ENV% -f -q -y python=%PYTHON% || exit /B +call conda create -p %_VERIFICATION_CONDA_ENV% ^ + --no-shortcuts -f -q -y python=%PYTHON% ^ + || exit /B call activate %_VERIFICATION_CONDA_ENV% || exit /B call conda install -y ^ + --no-shortcuts ^ python=3.7 ^ git ^ --file=ci\conda_env_cpp.yml ^ @@ -68,8 +71,9 @@ pushd %ARROW_SOURCE%\cpp\build @rem This is the path for Visual Studio Community 2017 call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\Common7\Tools\VsDevCmd.bat" -arch=amd64 -cmake -G "%GENERATOR%" ^ +cmake -G "Ninja" ^ -DCMAKE_INSTALL_PREFIX=%ARROW_HOME% ^ + -DARROW_BUILD_STATIC=OFF ^ -DARROW_BOOST_USE_SHARED=ON ^ -DARROW_BUILD_TESTS=ON ^ 
-DGTest_SOURCE=BUNDLED ^ @@ -79,7 +83,12 @@ cmake -G "%GENERATOR%" ^ -DARROW_PYTHON=ON ^ -DARROW_PARQUET=ON ^ .. || exit /B -cmake --build . --target INSTALL --config %CONFIGURATION% || exit /B + +@rem NOTE(wesm): Building googletest is flaky for me with ninja. Building it +@rem first fixes the problem +ninja googletest_ep || exit /B + +ninja install || exit /B @rem Get testing datasets for Parquet unit tests git clone https://github.com/apache/parquet-testing.git %_VERIFICATION_DIR%\parquet-testing From b70f04a7abf5b7c74724faa9d834ebd0daf3f745 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Wed, 2 Oct 2019 13:17:22 -0400 Subject: [PATCH 170/201] ARROW-6614: [C++][Dataset] Add DataSourceDiscovery class MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DataSourceDiscovery is a factory-like interface to build DataSource. The class exists for the purpose of unifying/discovery schema of datasource before materializing it (since all DataSources in a Dataset must have a matching schema). - Add MockFileSystem::Make - Refactor FileSystemBaseDataSource to use fs::PathTree and support partition pruning. 
Closes #5529 from fsaintjacques/ARROW-6614-file-discovery and squashes the following commits: 31736072e Address comments 81d97aa90 Draft DataSourceDiscovery interface 38c2c1cc7 Add Schema inspection to FileFormat 802836d28 Refactor FileSystemBasedDataSource to use PathTree 0d739e915 Add MockFileSystem::Make utility Authored-by: François Saint-Jacques Signed-off-by: Benjamin Kietzman --- cpp/src/arrow/dataset/CMakeLists.txt | 8 +- cpp/src/arrow/dataset/dataset.cc | 22 +-- cpp/src/arrow/dataset/dataset.h | 9 - cpp/src/arrow/dataset/discovery.cc | 91 ++++++++++ cpp/src/arrow/dataset/discovery.h | 69 +++++++- cpp/src/arrow/dataset/discovery_test.cc | 84 ++++++++++ cpp/src/arrow/dataset/file_base.cc | 131 ++++++++------- cpp/src/arrow/dataset/file_base.h | 61 ++++--- cpp/src/arrow/dataset/file_parquet.cc | 28 +++- cpp/src/arrow/dataset/file_parquet.h | 11 ++ cpp/src/arrow/dataset/file_parquet_test.cc | 27 ++- cpp/src/arrow/dataset/file_test.cc | 109 ++++++++---- cpp/src/arrow/dataset/filter.cc | 38 ++--- cpp/src/arrow/dataset/filter.h | 3 + cpp/src/arrow/dataset/scanner.h | 6 +- cpp/src/arrow/dataset/test_util.h | 177 ++++++++------------ cpp/src/arrow/filesystem/filesystem_test.cc | 13 ++ cpp/src/arrow/filesystem/mockfs.cc | 34 ++++ cpp/src/arrow/filesystem/mockfs.h | 9 + cpp/src/arrow/type.h | 1 + 20 files changed, 638 insertions(+), 293 deletions(-) create mode 100644 cpp/src/arrow/dataset/discovery.cc create mode 100644 cpp/src/arrow/dataset/discovery_test.cc diff --git a/cpp/src/arrow/dataset/CMakeLists.txt b/cpp/src/arrow/dataset/CMakeLists.txt index 6f94297e272c..442e9e6587bc 100644 --- a/cpp/src/arrow/dataset/CMakeLists.txt +++ b/cpp/src/arrow/dataset/CMakeLists.txt @@ -23,7 +23,12 @@ arrow_install_all_headers("arrow/dataset") # pkg-config support arrow_add_pkg_config("arrow-dataset") -set(ARROW_DATASET_SRCS dataset.cc file_base.cc filter.cc scanner.cc) +set(ARROW_DATASET_SRCS + dataset.cc + discovery.cc + file_base.cc + filter.cc + scanner.cc) 
set(ARROW_DATASET_LINK_STATIC arrow_static) set(ARROW_DATASET_LINK_SHARED arrow_shared) @@ -91,6 +96,7 @@ endfunction() if(NOT WIN32) add_arrow_dataset_test(dataset_test) + add_arrow_dataset_test(discovery_test) add_arrow_dataset_test(file_test) add_arrow_dataset_test(filter_test) add_arrow_dataset_test(scanner_test) diff --git a/cpp/src/arrow/dataset/dataset.cc b/cpp/src/arrow/dataset/dataset.cc index e3b81977e2b6..34b01a27c3e1 100644 --- a/cpp/src/arrow/dataset/dataset.cc +++ b/cpp/src/arrow/dataset/dataset.cc @@ -65,26 +65,28 @@ Status Dataset::NewScan(std::unique_ptr* out) { bool DataSource::AssumePartitionExpression( const std::shared_ptr& scan_options, std::shared_ptr* simplified_scan_options) const { - DCHECK_NE(simplified_scan_options, nullptr); - if (scan_options == nullptr) { - // null scan options; no selector to simplify - *simplified_scan_options = scan_options; + auto filter = scan_options->filter; + if (filter == nullptr || partition_expression_ == nullptr) { + if (simplified_scan_options != nullptr) { + *simplified_scan_options = scan_options; + } return true; } - auto c = SelectorAssume(scan_options->selector, partition_expression_); + auto c = filter->Assume(*partition_expression_); DCHECK_OK(c.status()); auto expr = std::move(c).ValueOrDie(); - bool trivial = true; - if (expr->IsNull() || (expr->IsTrivialCondition(&trivial) && !trivial)) { + if (expr->IsNull() || expr->IsTrivialFalseCondition()) { // selector is not satisfiable; yield no fragments return false; } - auto copy = std::make_shared(*scan_options); - copy->selector = ExpressionSelector(std::move(expr)); - *simplified_scan_options = std::move(copy); + if (simplified_scan_options != nullptr) { + auto copy = std::make_shared(*scan_options); + copy->filter = std::move(expr); + *simplified_scan_options = std::move(copy); + } return true; } diff --git a/cpp/src/arrow/dataset/dataset.h b/cpp/src/arrow/dataset/dataset.h index b6f7ca7106a7..7ff580706861 100644 --- 
a/cpp/src/arrow/dataset/dataset.h +++ b/cpp/src/arrow/dataset/dataset.h @@ -148,14 +148,5 @@ class ARROW_DS_EXPORT Dataset : public std::enable_shared_from_this { std::vector> sources_; }; -/// \brief Conditions to apply to a dataset when reading to include or -/// exclude fragments, filter out rows, etc. -struct DataSelector { - std::vector> filters; - - // TODO(wesm): Select specific partition keys, file path globs, or - // other common desirable selections -}; - } // namespace dataset } // namespace arrow diff --git a/cpp/src/arrow/dataset/discovery.cc b/cpp/src/arrow/dataset/discovery.cc new file mode 100644 index 000000000000..0a0b8d59f63a --- /dev/null +++ b/cpp/src/arrow/dataset/discovery.cc @@ -0,0 +1,91 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/dataset/discovery.h" + +#include +#include +#include +#include +#include + +#include "arrow/dataset/dataset.h" +#include "arrow/dataset/file_base.h" +#include "arrow/dataset/type_fwd.h" +#include "arrow/filesystem/path_tree.h" +#include "arrow/status.h" + +namespace arrow { +namespace dataset { + +FileSystemDataSourceDiscovery::FileSystemDataSourceDiscovery( + fs::FileSystem* filesystem, std::vector files, + std::shared_ptr format) + : fs_(filesystem), files_(std::move(files)), format_(std::move(format)) {} + +Status FileSystemDataSourceDiscovery::Make(fs::FileSystem* filesystem, + std::vector files, + std::shared_ptr format, + std::shared_ptr* out) { + out->reset(new FileSystemDataSourceDiscovery(filesystem, files, format)); + return Status::OK(); +} + +Status FileSystemDataSourceDiscovery::Make(fs::FileSystem* filesystem, + fs::Selector selector, + std::shared_ptr format, + std::shared_ptr* out) { + std::vector files; + RETURN_NOT_OK(filesystem->GetTargetStats(selector, &files)); + return Make(filesystem, files, format, out); +} + +static inline Status InspectSchema(fs::FileSystem* fs, + const std::vector stats, + const std::shared_ptr& format, + std::shared_ptr* out) { + std::vector> schemas; + + for (const auto& f : stats) { + if (!f.IsFile()) continue; + + std::shared_ptr schema; + RETURN_NOT_OK(format->Inspect(FileSource(f.path(), fs), &schema)); + schemas.push_back(schema); + } + + if (schemas.size() > 0) { + // TODO merge schemas. 
+ *out = schemas[0]; + } + + return Status::OK(); +} + +Status FileSystemDataSourceDiscovery::Inspect(std::shared_ptr* out) { + return InspectSchema(fs_, files_, format_, out); +} + +Status FileSystemDataSourceDiscovery::Build(const BuildOptions& options, + std::shared_ptr source_partition, + std::shared_ptr* out) { + return FileSystemBasedDataSource::Make(fs_, files_, std::move(source_partition), + PathPartitions{}, format_, out); +} + +} // namespace dataset +} // namespace arrow diff --git a/cpp/src/arrow/dataset/discovery.h b/cpp/src/arrow/dataset/discovery.h index 18242250f326..5f0387b23cd4 100644 --- a/cpp/src/arrow/dataset/discovery.h +++ b/cpp/src/arrow/dataset/discovery.h @@ -23,23 +23,80 @@ #include #include +#include #include "arrow/dataset/type_fwd.h" #include "arrow/dataset/visibility.h" +#include "arrow/filesystem/filesystem.h" +#include "arrow/filesystem/path_tree.h" #include "arrow/util/macros.h" namespace arrow { namespace dataset { -struct ARROW_DS_EXPORT DiscoveryOptions { - std::shared_ptr format = NULLPTR; +struct ARROW_DS_EXPORT BuildOptions { + /// Schema to conform to. + std::shared_ptr schema = NULLPTR; + /// The partition scheme indicate how to discover partitions for the data + /// source and fragments. std::shared_ptr partition_scheme = NULLPTR; }; -/// \brief Using a root directory -ARROW_DS_EXPORT -Status DiscoverSource(const std::string& path, fs::FileSystem* filesystem, - const DiscoveryOptions& options, std::shared_ptr* out); +/// \brief DataSourceDiscovery provides a way to inspect a DataSource potential +/// schema before materializing it. Thus, the user can peek the schema for +/// data sources and decide on a unified schema. 
The pseudocode would look like +/// +/// def get_dataset(factories): +/// schemas = [] +/// for f in factories: +/// schemas.append(f.Inspect()) +/// +/// common_schema = UnifySchemas(schemas) +/// +/// sources = [] +/// for f in factories: +/// sources.append(f.Discover({schema: common_schema})) +/// +/// return Dataset(sources, common_schema) +class ARROW_DS_EXPORT DataSourceDiscovery { + public: + /// \brief Get the schema for the resulting DataSource. + virtual Status Inspect(std::shared_ptr* out) = 0; + + /// \brief Create a DataSource with a given partition. + virtual Status Build(const BuildOptions& options, + std::shared_ptr source_partition, + std::shared_ptr* out) = 0; + + virtual ~DataSourceDiscovery() = default; +}; + +/// \brief FileSystemDataSourceFactory creates a DataSource from a vector +/// of fs::FileStats or a fs::Selector. +class ARROW_DS_EXPORT FileSystemDataSourceDiscovery : public DataSourceDiscovery { + public: + static Status Make(fs::FileSystem* filesystem, std::vector files, + std::shared_ptr format, + std::shared_ptr* out); + + static Status Make(fs::FileSystem* filesystem, fs::Selector selector, + std::shared_ptr format, + std::shared_ptr* out); + + Status Inspect(std::shared_ptr* out) override; + + Status Build(const BuildOptions& options, std::shared_ptr source_partition, + std::shared_ptr* out) override; + + protected: + FileSystemDataSourceDiscovery(fs::FileSystem* filesystem, + std::vector files, + std::shared_ptr format); + + fs::FileSystem* fs_; + std::vector files_; + std::shared_ptr format_; +}; } // namespace dataset } // namespace arrow diff --git a/cpp/src/arrow/dataset/discovery_test.cc b/cpp/src/arrow/dataset/discovery_test.cc new file mode 100644 index 000000000000..8dd3a2d3d920 --- /dev/null +++ b/cpp/src/arrow/dataset/discovery_test.cc @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/dataset/discovery.h" + +#include +#include + +#include "arrow/dataset/test_util.h" +#include "arrow/filesystem/test_util.h" + +namespace arrow { +namespace dataset { + +class FileSystemDataSourceDiscoveryTest : public TestFileSystemBasedDataSource { + public: + void MakeDiscovery(const std::vector& files) { + MakeFileSystem(files); + ASSERT_OK( + FileSystemDataSourceDiscovery::Make(fs_.get(), files, format_, &discovery_)); + } + + void MakeDiscovery(const std::vector& files, fs::Selector selector) { + MakeFileSystem(files); + ASSERT_OK( + FileSystemDataSourceDiscovery::Make(fs_.get(), selector, format_, &discovery_)); + } + + protected: + std::shared_ptr discovery_; + std::shared_ptr format_ = std::make_shared(); +}; + +TEST_F(FileSystemDataSourceDiscoveryTest, Basic) { + MakeDiscovery({fs::File("a"), fs::File("b")}); + + BuildOptions options; + ASSERT_OK(discovery_->Build(options, nullptr, &source_)); + AssertFragmentsAreFromPath(source_->GetFragments(options_), {"a", "b"}); +} + +TEST_F(FileSystemDataSourceDiscoveryTest, Selector) { + // This test ensure that the Selector is enforced. 
+ fs::Selector selector; + selector.base_dir = "A"; + MakeDiscovery({fs::File("0"), fs::File("A/a")}, selector); + + BuildOptions options; + ASSERT_OK(discovery_->Build(options, nullptr, &source_)); + AssertFragmentsAreFromPath(source_->GetFragments(options_), {"A/a"}); +} + +TEST_F(FileSystemDataSourceDiscoveryTest, Inspect) { + auto s = schema({field("f64", float64())}); + format_ = std::make_shared(s); + + MakeDiscovery({}); + std::shared_ptr actual; + + // No files + ASSERT_OK(discovery_->Inspect(&actual)); + EXPECT_EQ(actual, nullptr); + + MakeDiscovery({fs::File("test")}); + ASSERT_OK(discovery_->Inspect(&actual)); + EXPECT_EQ(actual, s); +} + +} // namespace dataset +} // namespace arrow diff --git a/cpp/src/arrow/dataset/file_base.cc b/cpp/src/arrow/dataset/file_base.cc index a335a2a679dd..4a75d650497d 100644 --- a/cpp/src/arrow/dataset/file_base.cc +++ b/cpp/src/arrow/dataset/file_base.cc @@ -48,79 +48,90 @@ Status FileBasedDataFragment::Scan(std::shared_ptr scan_context, } FileSystemBasedDataSource::FileSystemBasedDataSource( - fs::FileSystem* filesystem, const fs::Selector& selector, - std::shared_ptr format, std::shared_ptr partition_expression, - std::vector stats) - : DataSource(std::move(partition_expression)), + fs::FileSystem* filesystem, fs::PathForest forest, + std::shared_ptr source_partition, PathPartitions partitions, + std::shared_ptr format) + : DataSource(std::move(source_partition)), filesystem_(filesystem), - selector_(std::move(selector)), - format_(std::move(format)), - stats_(std::move(stats)) {} + forest_(std::move(forest)), + partitions_(std::move(partitions)), + format_(std::move(format)) {} Status FileSystemBasedDataSource::Make(fs::FileSystem* filesystem, - const fs::Selector& selector, + std::vector stats, + std::shared_ptr source_partition, + PathPartitions partitions, std::shared_ptr format, - std::shared_ptr partition_expression, - std::unique_ptr* out) { - std::vector stats; - 
RETURN_NOT_OK(filesystem->GetTargetStats(selector, &stats)); - - auto new_end = - std::remove_if(stats.begin(), stats.end(), [&](const fs::FileStats& stats) { - return stats.type() != fs::FileType::File || - !format->IsKnownExtension(stats.extension()); - }); - stats.resize(new_end - stats.begin()); - - out->reset(new FileSystemBasedDataSource(filesystem, selector, std::move(format), - std::move(partition_expression), - std::move(stats))); + std::shared_ptr* out) { + fs::PathForest forest; + RETURN_NOT_OK(fs::PathTree::Make(stats, &forest)); + out->reset(new FileSystemBasedDataSource(filesystem, std::move(forest), + std::move(source_partition), + std::move(partitions), std::move(format))); return Status::OK(); } -Status FileSystemBasedDataSource::Make(fs::FileSystem* filesystem, - const fs::Selector& selector, - std::shared_ptr format, - std::unique_ptr* out) { - return Make(filesystem, selector, std::move(format), nullptr, out); -} - DataFragmentIterator FileSystemBasedDataSource::GetFragmentsImpl( - std::shared_ptr scan_options) { - std::shared_ptr simplified_scan_options; - if (!AssumePartitionExpression(scan_options, &simplified_scan_options)) { - return MakeEmptyIterator>(); - } + std::shared_ptr options) { + std::vector> files; - struct Impl : DataFragmentIterator { - Impl(fs::FileSystem* filesystem, std::shared_ptr format, - std::shared_ptr scan_options, std::vector stats) - : filesystem_(filesystem), - format_(std::move(format)), - scan_options_(std::move(scan_options)), - stats_(std::move(stats)) {} - - Status Next(std::shared_ptr* out) { - if (i_ == stats_.size()) { - *out = nullptr; - return Status::OK(); - } - FileSource src(stats_[i_++].path(), filesystem_); - - std::unique_ptr fragment; - RETURN_NOT_OK(format_->MakeFragment(src, scan_options_, &fragment)); - *out = std::move(fragment); - return Status::OK(); + auto visitor = [&files](const fs::FileStats& stats) { + if (stats.IsFile()) { + files.emplace_back(new fs::FileStats(stats)); } + return 
Status::OK(); + }; + // The matcher ensures that directories (and their descendants) are not + // visited. + auto matcher = [this, options](const fs::FileStats& stats, bool* match) { + *match = this->PartitionMatches(stats, options->filter); + return Status::OK(); + }; + + for (auto tree : forest_) { + DCHECK_OK(tree->Visit(visitor, matcher)); + } + + auto file_it = MakeVectorIterator(std::move(files)); + auto file_to_fragment = [options, this](std::unique_ptr stats, + std::shared_ptr* out) { + std::unique_ptr fragment; + FileSource src(stats->path(), filesystem_); - size_t i_ = 0; - fs::FileSystem* filesystem_; - std::shared_ptr format_; - std::shared_ptr scan_options_; - std::vector stats_; + RETURN_NOT_OK(format_->MakeFragment(src, options, &fragment)); + *out = std::move(fragment); + return Status::OK(); }; - return DataFragmentIterator(Impl(filesystem_, format_, scan_options, stats_)); + return MakeMaybeMapIterator(file_to_fragment, std::move(file_it)); +} + +bool FileSystemBasedDataSource::PartitionMatches(const fs::FileStats& stats, + std::shared_ptr filter) { + if (filter == nullptr) { + return true; + } + + auto found = partitions_.find(stats.path()); + if (found == partitions_.end()) { + // No partition attached to current node (directory or file), continue. + return true; + } + + auto c = found->second->Assume(*filter); + if (!c.ok()) { + // Could not simplify expression move on! + return true; + } + + // TODO: pass simplified expressions to children + auto expr = std::move(c).ValueOrDie(); + if (expr->IsNull() || expr->IsTrivialFalseCondition()) { + // selector is not satisfiable; don't recurse in this branch. 
+ return false; + } + + return true; } } // namespace dataset diff --git a/cpp/src/arrow/dataset/file_base.h b/cpp/src/arrow/dataset/file_base.h index 440b36f75d95..c581251e4dc4 100644 --- a/cpp/src/arrow/dataset/file_base.h +++ b/cpp/src/arrow/dataset/file_base.h @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -29,12 +30,16 @@ #include "arrow/dataset/visibility.h" #include "arrow/dataset/writer.h" #include "arrow/filesystem/filesystem.h" +#include "arrow/filesystem/path_tree.h" #include "arrow/io/file.h" #include "arrow/util/compression.h" namespace arrow { + namespace dataset { +class Filter; + /// \brief The path and filesystem where an actual file is located or a buffer which can /// be read like a file class ARROW_DS_EXPORT FileSource { @@ -127,6 +132,10 @@ class ARROW_DS_EXPORT FileFormat { /// \brief Return true if the given file extension virtual bool IsKnownExtension(const std::string& ext) const = 0; + /// \brief Return the schema of the file if possible. + virtual Status Inspect(const FileSource& source, + std::shared_ptr* out) const = 0; + /// \brief Open a file for scanning virtual Status ScanFile(const FileSource& source, std::shared_ptr scan_options, @@ -161,36 +170,46 @@ class ARROW_DS_EXPORT FileBasedDataFragment : public DataFragment { std::shared_ptr scan_options_; }; -/// \brief A DataSource which takes files of one format from a directory -/// -/// The directory is crawled upon construction (Make) and not updated afterward. -/// GetFragments() will not include files added after this DataDource is constructed and -/// will error if files are deleted/moved. +/// \brief Mapping from path to partition expressions. +using PathPartitions = std::unordered_map>; + +/// \brief A DataSource of FileBasedDataFragments. 
class ARROW_DS_EXPORT FileSystemBasedDataSource : public DataSource { public: - static Status Make(fs::FileSystem* filesystem, const fs::Selector& selector, - std::shared_ptr format, - std::unique_ptr* out); - - static Status Make(fs::FileSystem* filesystem, const fs::Selector& selector, - std::shared_ptr format, - std::shared_ptr partition_expression, - std::unique_ptr* out); - - std::string type() const override { return "directory"; } + /// \brief Create a FileSystemBasedDataSource with optional partitions. + /// + /// \param[in] filesystem the filesystem which files are from. + /// \param[in] stats a list of files/directories to consume. + /// \param[in] source_partition the top-level partition of the DataSource + /// \param[in] partitions optional partitions attached to FileStats found in + /// `stats`. + /// \param[in] format file format to create fragments from. + /// \param[out] out pointer storing the resulting DataSource. + /// + /// The caller is not required to provide a complete coverage of nodes and + /// partitions. 
+ static Status Make(fs::FileSystem* filesystem, std::vector stats, + std::shared_ptr source_partition, + PathPartitions partitions, std::shared_ptr format, + std::shared_ptr* out); + + std::string type() const override { return "filesystem_data_source"; } protected: DataFragmentIterator GetFragmentsImpl(std::shared_ptr options) override; - FileSystemBasedDataSource(fs::FileSystem* filesystem, const fs::Selector& selector, - std::shared_ptr format, - std::shared_ptr partition_expression, - std::vector stats); + FileSystemBasedDataSource(fs::FileSystem* filesystem, fs::PathForest forest, + std::shared_ptr source_partition, + PathPartitions partitions, + std::shared_ptr format); + + bool PartitionMatches(const fs::FileStats& stats, std::shared_ptr filter); fs::FileSystem* filesystem_ = NULLPTR; - fs::Selector selector_; + fs::PathForest forest_; + PathPartitions partitions_; + std::shared_ptr format_; - std::vector stats_; }; } // namespace dataset diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index e6298231b89f..a23639ebd162 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -165,14 +165,26 @@ class ParquetScanTaskIterator { std::shared_ptr reader_; }; +Status ParquetFileFormat::Inspect(const FileSource& source, + std::shared_ptr* out) const { + auto pool = default_memory_pool(); + + std::unique_ptr reader; + RETURN_NOT_OK(OpenReader(source, pool, &reader)); + + std::unique_ptr arrow_reader; + RETURN_NOT_OK(parquet::arrow::FileReader::Make(pool, std::move(reader), &arrow_reader)); + + return arrow_reader->GetSchema(out); +} + Status ParquetFileFormat::ScanFile(const FileSource& source, std::shared_ptr scan_options, std::shared_ptr scan_context, ScanTaskIterator* out) const { - std::shared_ptr input; - RETURN_NOT_OK(source.Open(&input)); + std::unique_ptr reader; + RETURN_NOT_OK(OpenReader(source, scan_context->pool, &reader)); - auto reader = 
parquet::ParquetFileReader::Open(input); return ParquetScanTaskIterator::Make(scan_options, scan_context, std::move(reader), out); } @@ -185,5 +197,15 @@ Status ParquetFileFormat::MakeFragment(const FileSource& source, return Status::OK(); } +Status ParquetFileFormat::OpenReader( + const FileSource& source, MemoryPool* pool, + std::unique_ptr* out) const { + std::shared_ptr input; + RETURN_NOT_OK(source.Open(&input)); + + *out = parquet::ParquetFileReader::Open(input); + return Status::OK(); +} + } // namespace dataset } // namespace arrow diff --git a/cpp/src/arrow/dataset/file_parquet.h b/cpp/src/arrow/dataset/file_parquet.h index a04f4596f87f..cd3987845420 100644 --- a/cpp/src/arrow/dataset/file_parquet.h +++ b/cpp/src/arrow/dataset/file_parquet.h @@ -24,6 +24,10 @@ #include "arrow/dataset/type_fwd.h" #include "arrow/dataset/visibility.h" +namespace parquet { +class ParquetFileReader; +} // namespace parquet + namespace arrow { namespace dataset { @@ -47,6 +51,9 @@ class ARROW_DS_EXPORT ParquetFileFormat : public FileFormat { return ext == "par" || ext == "parq" || ext == name(); } + /// \brief Return the schema of the file if possible. 
+ Status Inspect(const FileSource& source, std::shared_ptr* out) const override; + /// \brief Open a file for scanning Status ScanFile(const FileSource& source, std::shared_ptr scan_options, std::shared_ptr scan_context, @@ -54,6 +61,10 @@ class ARROW_DS_EXPORT ParquetFileFormat : public FileFormat { Status MakeFragment(const FileSource& source, std::shared_ptr opts, std::unique_ptr* out) override; + + private: + Status OpenReader(const FileSource& source, MemoryPool* pool, + std::unique_ptr<::parquet::ParquetFileReader>* out) const; }; class ARROW_DS_EXPORT ParquetFragment : public FileBasedDataFragment { diff --git a/cpp/src/arrow/dataset/file_parquet_test.cc b/cpp/src/arrow/dataset/file_parquet_test.cc index 171d8cee1a7f..ba390f2d2018 100644 --- a/cpp/src/arrow/dataset/file_parquet_test.cc +++ b/cpp/src/arrow/dataset/file_parquet_test.cc @@ -42,6 +42,8 @@ using parquet::CreateOutputStream; using parquet::arrow::FileWriter; using parquet::arrow::WriteTable; +using testing::Pointee; + Status WriteRecordBatch(const RecordBatch& batch, FileWriter* writer) { auto schema = batch.schema(); auto size = batch.num_rows(); @@ -138,9 +140,11 @@ class ParquetBufferFixtureMixin : public ArrowParquetWriterMixin { builder->UnsafeAppend(0.0); })); - auto schema_ = schema({field("f64", f64->type())}); return RecordBatch::Make(schema_, kBatchSize, {f64}); } + + protected: + std::shared_ptr schema_ = schema({field("f64", float64())}); }; class TestParquetFileFormat : public ParquetBufferFixtureMixin { @@ -172,21 +176,14 @@ TEST_F(TestParquetFileFormat, ScanRecordBatchReader) { ASSERT_EQ(row_count, kNumRows); } -class TestParquetFileSystemBasedDataSource - : public FileSystemBasedDataSourceMixin { - std::vector file_names() const override { - return {"a/b/c.parquet", "a/b/c/d.parquet", "a/b.parquet", "a.parquet"}; - } -}; - -TEST_F(TestParquetFileSystemBasedDataSource, NonRecursive) { this->NonRecursive(); } - -TEST_F(TestParquetFileSystemBasedDataSource, Recursive) { 
this->Recursive(); } - -TEST_F(TestParquetFileSystemBasedDataSource, DeletedFile) { this->DeletedFile(); } +TEST_F(TestParquetFileFormat, Inspect) { + auto reader = GetRecordBatchReader(); + auto source = GetFileSource(reader.get()); + auto format = ParquetFileFormat(); -TEST_F(TestParquetFileSystemBasedDataSource, PredicatePushDown) { - this->PredicatePushDown(); + std::shared_ptr actual; + ASSERT_OK(format.Inspect(*source.get(), &actual)); + EXPECT_EQ(*actual, *schema_); } } // namespace dataset diff --git a/cpp/src/arrow/dataset/file_test.cc b/cpp/src/arrow/dataset/file_test.cc index 19e938a5e8b2..be6af4cbbfc0 100644 --- a/cpp/src/arrow/dataset/file_test.cc +++ b/cpp/src/arrow/dataset/file_test.cc @@ -15,23 +15,6 @@ // specific language governing permissions and limitations // under the License. -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- #include #include #include @@ -42,8 +25,8 @@ #include "arrow/dataset/api.h" #include "arrow/dataset/test_util.h" -#include "arrow/filesystem/localfs.h" #include "arrow/filesystem/path_util.h" +#include "arrow/filesystem/test_util.h" #include "arrow/status.h" #include "arrow/testing/gtest_util.h" #include "arrow/util/io_util.h" @@ -94,21 +77,89 @@ TEST(FileSource, BufferBased) { ASSERT_EQ(Compression::LZ4, source2.compression()); } -class TestDummyFileSystemBasedDataSource - : public FileSystemBasedDataSourceMixin { - std::vector file_names() const override { - return {"a/b/c.dummy", "a/b/c/d.dummy", "a/b.dummy", "a.dummy"}; - } -}; +TEST_F(TestFileSystemBasedDataSource, Basic) { + MakeSource({}); + AssertFragmentsAreFromPath(source_->GetFragments(options_), {}); + + MakeSource({fs::File("a"), fs::File("b"), fs::File("c")}); + AssertFragmentsAreFromPath(source_->GetFragments(options_), {"a", "b", "c"}); + + // Should not create fragment from directories. + MakeSource({fs::Dir("A"), fs::Dir("A/B"), fs::File("A/a"), fs::File("A/B/b")}); + AssertFragmentsAreFromPath(source_->GetFragments(options_), {"A/a", "A/B/b"}); +} + +TEST_F(TestFileSystemBasedDataSource, RootPartitionPruning) { + auto source_partition = ("a"_ == 5).Copy(); + MakeSource({fs::File("a"), fs::File("b")}, source_partition); + + // No filter should always return all data. 
+ options_->filter = nullptr; + AssertFragmentsAreFromPath(source_->GetFragments(options_), {"a", "b"}); -TEST_F(TestDummyFileSystemBasedDataSource, NonRecursive) { this->NonRecursive(); } + // filter == partition + options_->filter = source_partition; + AssertFragmentsAreFromPath(source_->GetFragments(options_), {"a", "b"}); -TEST_F(TestDummyFileSystemBasedDataSource, Recursive) { this->Recursive(); } + // Same partition key, but non matching filter + options_->filter = ("a"_ == 6).Copy(); + AssertFragmentsAreFromPath(source_->GetFragments(options_), {}); -TEST_F(TestDummyFileSystemBasedDataSource, DeletedFile) { this->DeletedFile(); } + options_->filter = ("a"_ > 1).Copy(); + AssertFragmentsAreFromPath(source_->GetFragments(options_), {"a", "b"}); + + // different key shouldn't prune + options_->filter = ("b"_ == 6).Copy(); + AssertFragmentsAreFromPath(source_->GetFragments(options_), {"a", "b"}); + + // No partition should match + MakeSource({fs::File("a"), fs::File("b")}); + options_->filter = ("b"_ == 6).Copy(); + AssertFragmentsAreFromPath(source_->GetFragments(options_), {"a", "b"}); +} -TEST_F(TestDummyFileSystemBasedDataSource, PredicatePushDown) { - this->PredicatePushDown(); +TEST_F(TestFileSystemBasedDataSource, TreePartitionPruning) { + auto source_partition = ("country"_ == "US").Copy(); + std::vector regions = { + fs::Dir("NY"), fs::File("NY/New York"), fs::File("NY/Franklin"), + fs::Dir("CA"), fs::File("CA/San Francisco"), fs::File("CA/Franklin"), + }; + // Explicitly _don't_ set the state partition in the leaves to test if + // sub-tree pruning works. This implies that `state` predicate won't apply to + // files. 
+ PathPartitions partitions = { + {"CA", ("state"_ == "CA").Copy()}, + {"CA/San Francisco", ("city"_ == "San Francisco").Copy()}, + {"CA/Franklin", ("city"_ == "Franklin").Copy()}, + {"NY", ("state"_ == "NY").Copy()}, + {"NY/New York", ("city"_ == "New York").Copy()}, + {"NY/Franklin", ("city"_ == "Franklin").Copy()}, + }; + + MakeSource(regions, source_partition, partitions); + + std::vector all_cities = {"CA/San Francisco", "CA/Franklin", "NY/New York", + "NY/Franklin"}; + std::vector ca_cities = {"CA/San Francisco", "CA/Franklin"}; + std::vector franklins = {"CA/Franklin", "NY/Franklin"}; + + // No filter should always return all data. + options_->filter = nullptr; + AssertFragmentsAreFromPath(source_->GetFragments(options_), all_cities); + + // Data source partition is respected + options_->filter = ("country"_ == "US").Copy(); + AssertFragmentsAreFromPath(source_->GetFragments(options_), all_cities); + options_->filter = ("country"_ == "FR").Copy(); + AssertFragmentsAreFromPath(source_->GetFragments(options_), {}); + + options_->filter = ("state"_ == "CA").Copy(); + AssertFragmentsAreFromPath(source_->GetFragments(options_), ca_cities); + + // Filter where no decisions can be made on inner nodes when filter don't + // apply to inner partitions. 
+ options_->filter = ("city"_ == "Franklin").Copy(); + AssertFragmentsAreFromPath(source_->GetFragments(options_), franklins); } } // namespace dataset diff --git a/cpp/src/arrow/dataset/filter.cc b/cpp/src/arrow/dataset/filter.cc index 7e1ee82b22b1..bcdee2897f1f 100644 --- a/cpp/src/arrow/dataset/filter.cc +++ b/cpp/src/arrow/dataset/filter.cc @@ -842,6 +842,16 @@ bool Expression::IsTrivialCondition(bool* out) const { return true; } +bool Expression::IsTrivialTrueCondition() const { + bool value = false; + return IsTrivialCondition(&value) && value; +} + +bool Expression::IsTrivialFalseCondition() const { + bool value = false; + return IsTrivialCondition(&value) && !value; +} + std::shared_ptr FieldExpression::Copy() const { return std::make_shared(*this); } @@ -939,33 +949,5 @@ Result> FieldExpression::Validate(const Schema& schema return null(); } -Result> SelectorAssume( - const std::shared_ptr& selector, - const std::shared_ptr& given) { - if (selector == nullptr || selector->filters.size() == 0) { - return ScalarExpression::Make(true); - } - - auto get_expression = [](const std::shared_ptr& f) { - DCHECK_EQ(f->type(), FilterType::EXPRESSION); - return checked_cast(*f).expression(); - }; - - auto out_expr = get_expression(selector->filters[0]); - for (size_t i = 1; i < selector->filters.size(); ++i) { - out_expr = and_(std::move(out_expr), get_expression(selector->filters[i])); - } - - if (given == nullptr) { - return std::move(out_expr); - } - return out_expr->Assume(*given); -} - -std::shared_ptr ExpressionSelector(std::shared_ptr e) { - return std::make_shared( - DataSelector{FilterVector{std::make_shared(std::move(e))}}); -} - } // namespace dataset } // namespace arrow diff --git a/cpp/src/arrow/dataset/filter.h b/cpp/src/arrow/dataset/filter.h index 364fca0d2d4e..3f122c18ea4c 100644 --- a/cpp/src/arrow/dataset/filter.h +++ b/cpp/src/arrow/dataset/filter.h @@ -151,6 +151,9 @@ class ARROW_DS_EXPORT Expression { /// BooleanScalar. 
Its value may be retrieved at the same time. bool IsTrivialCondition(bool* value = NULLPTR) const; + bool IsTrivialTrueCondition() const; + bool IsTrivialFalseCondition() const; + /// Copy this expression into a shared pointer. virtual std::shared_ptr Copy() const = 0; diff --git a/cpp/src/arrow/dataset/scanner.h b/cpp/src/arrow/dataset/scanner.h index cfb661f4c0a7..7cde4ef2888b 100644 --- a/cpp/src/arrow/dataset/scanner.h +++ b/cpp/src/arrow/dataset/scanner.h @@ -38,16 +38,16 @@ class ARROW_DS_EXPORT ScanOptions { public: ScanOptions() = default; - ScanOptions(std::shared_ptr selector, std::shared_ptr schema, + ScanOptions(std::shared_ptr filter, std::shared_ptr schema, std::vector> options = {}) - : selector(std::move(selector)), schema(std::move(schema)) {} + : filter(std::move(filter)), schema(std::move(schema)) {} virtual ~ScanOptions() = default; MemoryPool* pool() const { return pool_; } // Filters - std::shared_ptr selector; + std::shared_ptr filter; // Schema to which record batches will be reconciled std::shared_ptr schema; diff --git a/cpp/src/arrow/dataset/test_util.h b/cpp/src/arrow/dataset/test_util.h index 459185e527f3..1a7301b55187 100644 --- a/cpp/src/arrow/dataset/test_util.h +++ b/cpp/src/arrow/dataset/test_util.h @@ -15,16 +15,24 @@ // specific language governing permissions and limitations // under the License. 
+#pragma once + +#include #include #include #include #include #include +#include +#include + #include "arrow/dataset/file_base.h" #include "arrow/dataset/filter.h" #include "arrow/filesystem/localfs.h" +#include "arrow/filesystem/mockfs.h" #include "arrow/filesystem/path_util.h" +#include "arrow/filesystem/test_util.h" #include "arrow/record_batch.h" #include "arrow/testing/gtest_util.h" #include "arrow/util/io_util.h" @@ -155,7 +163,7 @@ class DatasetFixtureMixin : public ::testing::Test { } protected: - std::shared_ptr options_ = nullptr; + std::shared_ptr options_ = std::make_shared(); std::shared_ptr ctx_; }; @@ -164,117 +172,12 @@ class FileSystemBasedDataSourceMixin : public FileSourceFixtureMixin { public: virtual std::vector file_names() const = 0; - void SetUp() override { - selector_.base_dir = "/"; - selector_.recursive = true; - - format_ = std::make_shared(); - schema_ = schema({field("dummy", null())}); - options_ = std::make_shared(); - - ASSERT_OK( - TemporaryDir::Make("test-fsdatasource-" + format_->name() + "-", &temp_dir_)); - local_fs_ = std::make_shared(); - - auto path = temp_dir_->path().ToString(); - fs_ = std::make_shared(path, local_fs_); - - for (auto path : file_names()) { - CreateFile(path, ""); - } - - partition_expression_ = ScalarExpression::Make(true); - } - - void CreateFile(std::string path, std::string contents) { - auto parent = fs::internal::GetAbstractPathParent(path).first; - if (parent != "") { - ASSERT_OK(this->fs_->CreateDir(parent, true)); - } - std::shared_ptr file; - ASSERT_OK(this->fs_->OpenOutputStream(path, &file)); - ASSERT_OK(file->Write(contents)); - } - - void MakeDataSource() { - ASSERT_OK(FileSystemBasedDataSource::Make(fs_.get(), selector_, format_, - partition_expression_, &source_)); - } - - protected: - std::function fragment)> OpenFragments( - size_t* count) { - return [this, count](std::shared_ptr fragment) { - auto file_fragment = - internal::checked_pointer_cast(fragment); - ++*count; - auto extension = 
- fs::internal::GetAbstractPathExtension(file_fragment->source().path()); - EXPECT_TRUE(format_->IsKnownExtension(extension)); - std::shared_ptr f; - return this->fs_->OpenInputFile(file_fragment->source().path(), &f); - }; - } - - void NonRecursive() { - selector_.recursive = false; - MakeDataSource(); - - size_t count = 0; - ASSERT_OK(source_->GetFragments(options_).Visit(OpenFragments(&count))); - ASSERT_EQ(count, 1); - } - - void Recursive() { - MakeDataSource(); - - size_t count = 0; - ASSERT_OK(source_->GetFragments(options_).Visit(OpenFragments(&count))); - ASSERT_EQ(count, file_names().size()); - } - - void DeletedFile() { - MakeDataSource(); - ASSERT_GT(file_names().size(), 0); - ASSERT_OK(this->fs_->DeleteFile(file_names()[0])); - - size_t count = 0; - ASSERT_RAISES(IOError, source_->GetFragments(options_).Visit(OpenFragments(&count))); - } - - void PredicatePushDown() { - partition_expression_ = equal(field_ref("alpha"), ScalarExpression::Make(3)); - MakeDataSource(); - - options_->selector = std::make_shared(); - options_->selector->filters.resize(1); - - // with a filter identical to the partition condition, all fragments are yielded - options_->selector->filters[0] = - std::make_shared(partition_expression_->Copy()); - - size_t count = 0; - // ASSERT_OK(source_->GetFragments(context_)->Visit(OpenFragments(&count))); - // ASSERT_EQ(count, file_names().size()); - - // with a filter which contradicts the partition condition, no fragments are yielded - options_->selector->filters[0] = std::make_shared( - equal(field_ref("alpha"), ScalarExpression::Make(0))); - - count = 0; - ASSERT_OK(source_->GetFragments(options_).Visit(OpenFragments(&count))); - ASSERT_EQ(count, 0); - } - fs::Selector selector_; - std::unique_ptr source_; - std::shared_ptr local_fs_; + std::unique_ptr source_; std::shared_ptr fs_; - std::unique_ptr temp_dir_; std::shared_ptr format_; std::shared_ptr schema_; - std::shared_ptr options_; - std::shared_ptr partition_expression_; + 
std::shared_ptr options_ = std::make_shared(); }; template @@ -286,11 +189,19 @@ std::unique_ptr> MakeGeneratedRecordBatch( /// \brief A dummy FileFormat implementation class DummyFileFormat : public FileFormat { public: + explicit DummyFileFormat(std::shared_ptr schema = NULLPTR) + : schema_(std::move(schema)) {} + std::string name() const override { return "dummy"; } /// \brief Return true if the given file extension bool IsKnownExtension(const std::string& ext) const override { return ext == name(); } + Status Inspect(const FileSource& source, std::shared_ptr* out) const override { + *out = schema_; + return Status::OK(); + } + /// \brief Open a file for scanning (always returns an empty iterator) Status ScanFile(const FileSource& source, std::shared_ptr scan_options, std::shared_ptr scan_context, @@ -302,6 +213,9 @@ class DummyFileFormat : public FileFormat { inline Status MakeFragment(const FileSource& location, std::shared_ptr opts, std::unique_ptr* out) override; + + protected: + std::shared_ptr schema_; }; class DummyFragment : public FileBasedDataFragment { @@ -319,5 +233,52 @@ Status DummyFileFormat::MakeFragment(const FileSource& source, return Status::OK(); } +class TestFileSystemBasedDataSource : public ::testing::Test { + public: + void SetUp() { options_ = std::make_shared(); } + + void MakeFileSystem(const std::vector& stats) { + ASSERT_OK(fs::internal::MockFileSystem::Make(fs::kNoTime, stats, &fs_)); + } + + void MakeFileSystem(const std::vector& paths) { + std::vector stats{paths.size()}; + std::transform(paths.cbegin(), paths.cend(), stats.begin(), + [](const std::string& p) { return fs::File(p); }); + + ASSERT_OK(fs::internal::MockFileSystem::Make(fs::kNoTime, stats, &fs_)); + } + + void MakeSource(const std::vector& stats, + std::shared_ptr source_partition = nullptr, + PathPartitions partitions = {}) { + MakeFileSystem(stats); + auto format = std::make_shared(); + ASSERT_OK(FileSystemBasedDataSource::Make(fs_.get(), stats, source_partition, + 
partitions, format, &source_)); + } + + protected: + std::shared_ptr fs_; + std::shared_ptr source_; + std::shared_ptr options_; +}; + +void AssertFragmentsAreFromPath(DataFragmentIterator it, + std::vector expected) { + std::vector actual; + + auto v = [&actual](std::shared_ptr fragment) -> Status { + EXPECT_NE(fragment, nullptr); + auto dummy = std::static_pointer_cast(fragment); + actual.push_back(dummy->source().path()); + return Status::OK(); + }; + + ASSERT_OK(it.Visit(v)); + // Ordering is not guaranteed. + EXPECT_THAT(actual, testing::UnorderedElementsAreArray(expected)); +} + } // namespace dataset } // namespace arrow diff --git a/cpp/src/arrow/filesystem/filesystem_test.cc b/cpp/src/arrow/filesystem/filesystem_test.cc index c199a2dde2cb..0b183550efff 100644 --- a/cpp/src/arrow/filesystem/filesystem_test.cc +++ b/cpp/src/arrow/filesystem/filesystem_test.cc @@ -365,6 +365,19 @@ TEST_F(TestMockFS, OpenAppendStream) { CheckFiles({{"ab", time_, "some data"}}); } +TEST_F(TestMockFS, Make) { + std::shared_ptr fs; + ASSERT_OK(MockFileSystem::Make(time_, {}, &fs)); + fs_ = std::static_pointer_cast(fs); + CheckDirs({}); + CheckFiles({}); + + ASSERT_OK(MockFileSystem::Make(time_, {Dir("A/B/C"), File("A/a")}, &fs)); + fs_ = std::static_pointer_cast(fs); + CheckDirs({{"A", time_}, {"A/B", time_}, {"A/B/C", time_}}); + CheckFiles({{"A/a", time_, ""}}); +} + //////////////////////////////////////////////////////////////////////////// // Concrete SubTreeFileSystem tests diff --git a/cpp/src/arrow/filesystem/mockfs.cc b/cpp/src/arrow/filesystem/mockfs.cc index a6ff7c78d6f8..c813355c818f 100644 --- a/cpp/src/arrow/filesystem/mockfs.cc +++ b/cpp/src/arrow/filesystem/mockfs.cc @@ -656,6 +656,40 @@ std::vector MockFileSystem::AllFiles() { return result; } +Status MockFileSystem::CreateFile(const std::string& path, const std::string& contents, + bool recursive) { + auto parent = fs::internal::GetAbstractPathParent(path).first; + if (parent != "") { + 
RETURN_NOT_OK(CreateDir(parent, recursive)); + } + + std::shared_ptr file; + RETURN_NOT_OK(OpenOutputStream(path, &file)); + RETURN_NOT_OK(file->Write(contents)); + return file->Close(); +} + +Status MockFileSystem::Make(TimePoint current_time, const std::vector& stats, + std::shared_ptr* out) { + auto fs = std::make_shared(current_time); + for (const auto& s : stats) { + switch (s.type()) { + case FileType::Directory: + RETURN_NOT_OK(fs->CreateDir(s.path(), /*recursive*/ true)); + break; + case FileType::File: + RETURN_NOT_OK(fs->CreateFile(s.path(), "", /*recursive*/ true)); + break; + default: + break; + } + } + + *out = fs; + + return Status::OK(); +} + } // namespace internal } // namespace fs } // namespace arrow diff --git a/cpp/src/arrow/filesystem/mockfs.h b/cpp/src/arrow/filesystem/mockfs.h index faafe9a3bccd..e23d3029d9c4 100644 --- a/cpp/src/arrow/filesystem/mockfs.h +++ b/cpp/src/arrow/filesystem/mockfs.h @@ -94,6 +94,15 @@ class ARROW_EXPORT MockFileSystem : public FileSystem { std::vector AllDirs(); std::vector AllFiles(); + // Create a File with a content from a string. + Status CreateFile(const std::string& path, const std::string& content, + bool recursive = true); + + // Create a MockFileSystem out of (empty) FileStats. The content of every + // file is empty and of size 0. All directories will be created recursively. 
+ static Status Make(TimePoint current_time, const std::vector& stats, + std::shared_ptr* out); + class Impl; protected: diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 60cae37e6360..72b6ce405223 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1317,6 +1317,7 @@ class ARROW_EXPORT Schema : public detail::Fingerprintable { /// Returns true if all of the schema fields are equal bool Equals(const Schema& other, bool check_metadata = true) const; + bool operator==(const Schema& other) const { return Equals(other); } /// \brief Return the number of fields (columns) in the schema int num_fields() const; From 48b56bd593909832aa37a58681e80cfd4e041d53 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 2 Oct 2019 19:41:57 +0200 Subject: [PATCH 171/201] ARROW-6581: [C++] Fix fuzzit job submission MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Also make it faster (we don't need Gandiva here). Closes #5407 from pitrou/ARROW-6581-fix-fuzzit-job and squashes the following commits: 2628a74f8 ARROW-6581: Fix fuzzit job submission Authored-by: Antoine Pitrou Signed-off-by: Krisztián Szűcs --- ci/docker_build_and_fuzzit.sh | 16 +++++++++------- docker-compose.yml | 36 +++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/ci/docker_build_and_fuzzit.sh b/ci/docker_build_and_fuzzit.sh index 18bd0d4c599e..b3bf1bd81fed 100755 --- a/ci/docker_build_and_fuzzit.sh +++ b/ci/docker_build_and_fuzzit.sh @@ -24,13 +24,14 @@ export ARROW_USE_ASAN="ON" export CC="clang-7" export CXX="clang++-7" export ARROW_BUILD_TYPE="RelWithDebInfo" +export ARROW_FLIGHT="OFF" +export ARROW_GANDIVA="OFF" export ARROW_ORC="OFF" export ARROW_PARQUET="OFF" export ARROW_PLASMA="OFF" -export ARROW_FLIGHT="OFF" -export ARROW_BUILD_BENCHMARKS="OFF" export ARROW_WITH_BZ2="OFF" export ARROW_WITH_ZSTD="OFF" +export ARROW_BUILD_BENCHMARKS="OFF" export ARROW_BUILD_UTILITIES="OFF" 
/arrow/ci/docker_build_cpp.sh || exit 1 pushd /build/cpp @@ -40,11 +41,12 @@ cp ./relwithdebinfo/arrow-ipc-fuzzing-test ./relwithdebinfo/out/fuzzer ldd ./relwithdebinfo/arrow-ipc-fuzzing-test | grep "=> /" | awk '{print $3}' | xargs -I '{}' cp -v '{}' ./relwithdebinfo/out/. cd ./relwithdebinfo/out/ tar -czvf fuzzer.tar.gz * +stat fuzzer.tar.gz cd ../../ -export TARGET_ID=u79f6bXYgNH4NkU99iWK -export FUZZIT_API_KEY=${FUZZIT_API_KEY:-ac6089a1bc2313679f2d99bb80553162c380676bff3f094de826b16229e28184a8084b86f52c95112bde6b3dbb07b9b7} -wget -O fuzzit https://bin.fuzzit.dev/fuzzit-1.1 +export TARGET_ID=apache-arrow/arrow-ipc-fuzzing + +wget -O fuzzit https://github.com/fuzzitdev/fuzzit/releases/latest/download/fuzzit_Linux_x86_64 chmod a+x fuzzit -./fuzzit auth $FUZZIT_API_KEY -./fuzzit create job --type fuzzing --host bionic-llvm7 --revision $CI_ARROW_SHA --branch $CI_ARROW_BRANCH $TARGET_ID ./relwithdebinfo/out/fuzzer.tar.gz + +./fuzzit create job --type $FUZZIT_JOB_TYPE --host bionic-llvm7 --revision $CI_ARROW_SHA --branch $CI_ARROW_BRANCH $TARGET_ID ./relwithdebinfo/out/fuzzer.tar.gz diff --git a/docker-compose.yml b/docker-compose.yml index 0d0f6f9fc6bc..6ebf61646189 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -447,6 +447,7 @@ services: volumes: *ubuntu-volumes fuzzit: + # Submit a fuzzing job on fuzzit.dev # Usage: # docker-compose build cpp # docker-compose build fuzzit @@ -458,6 +459,41 @@ services: environment: CI_ARROW_SHA: ${CI_ARROW_SHA} CI_ARROW_BRANCH: ${CI_ARROW_BRANCH} + FUZZIT_JOB_TYPE: fuzzing + FUZZIT_API_KEY: c0b760d37db6012fcaafd8ca5f412ba7bcd297ac969650502994b51aa11798153824442f999a067e1ef67821989ed664 + volumes: *ubuntu-volumes + + fuzzit-regression: + # Submit a regression job on fuzzit.dev + # Usage: + # docker-compose build cpp + # docker-compose build fuzzit-regression + # docker-compose run fuzzit-regression + image: arrow:fuzzit + build: + context: . 
+ dockerfile: dev/fuzzit/Dockerfile + environment: + CI_ARROW_SHA: ${CI_ARROW_SHA} + CI_ARROW_BRANCH: ${CI_ARROW_BRANCH} + FUZZIT_JOB_TYPE: regression + FUZZIT_API_KEY: c0b760d37db6012fcaafd8ca5f412ba7bcd297ac969650502994b51aa11798153824442f999a067e1ef67821989ed664 + volumes: *ubuntu-volumes + + fuzzit-local-regression: + # Run a local fuzzit regression job + # Usage: + # docker-compose build cpp + # docker-compose build fuzzit-local-regression + # docker-compose run fuzzit-local-regression + image: arrow:fuzzit + build: + context: . + dockerfile: dev/fuzzit/Dockerfile + environment: + CI_ARROW_SHA: ${CI_ARROW_SHA} + CI_ARROW_BRANCH: ${CI_ARROW_BRANCH} + FUZZIT_JOB_TYPE: local-regression volumes: *ubuntu-volumes iwyu: From fda549a74170efbbe0d2f37f2e34056b177e14ef Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 2 Oct 2019 19:09:27 -0400 Subject: [PATCH 172/201] ARROW-6761: [Rust] Travis build now uses the correct Rust toolchain There is a compiler bug apparently in Rust 1.40.0-nightly so we need to keep using 1.39.0 for now. The Travis build script was using a `RUSTUP_TOOLCHAIN=nightly` which seems to override the `rust-toolchain` file. Also, Travis builds the arrow subcrate individually and therefore wasn't picking up the `rust-toolchain` file in the root of the Rust project. 
Closes #5561 from andygrove/fix-travis-builds and squashes the following commits: 8293f44b8 no need to copy rust-toolchain c2a209f7c preserve original behavior of testing only after building arrow subcrate without default features a65b890ec try and fix build b53b26a58 fix error 81663b9ca Travis build now uses the correct Rust toolchain Authored-by: Andy Grove Signed-off-by: Paddy Horan --- .travis.yml | 2 +- ci/travis_script_rust.sh | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 534004d0d160..a107bcc3f4b9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -298,7 +298,7 @@ matrix: - if [ $ARROW_CI_RUST_AFFECTED != "1" ]; then exit; fi - $TRAVIS_BUILD_DIR/ci/travis_install_cargo.sh script: - - RUSTUP_TOOLCHAIN=nightly $TRAVIS_BUILD_DIR/ci/travis_script_rust.sh + - $TRAVIS_BUILD_DIR/ci/travis_script_rust.sh before_cache: cargo install cargo-tarpaulin -f after_success: diff --git a/ci/travis_script_rust.sh b/ci/travis_script_rust.sh index 0f6f619e32e0..22059df228f4 100755 --- a/ci/travis_script_rust.sh +++ b/ci/travis_script_rust.sh @@ -31,11 +31,16 @@ rustup show # raises on any formatting errors cargo +stable fmt --all -- --check +# build entire project RUSTFLAGS="-D warnings" cargo build --all-targets + +# run tests +cargo test + +# make sure we can build Arrow sub-crate without default features pushd arrow cargo build --no-default-features popd -cargo test # run Arrow examples pushd arrow From ad4eccb36aebc3325de22f4f46ed1448254f15d7 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 3 Oct 2019 14:46:52 +0900 Subject: [PATCH 173/201] ARROW-6777: [GLib][CI] Unpin gobject-introspection gem gobject-introspection gem 3.3.8 or later is needed for GLib 2.62.0 or later. But gobject-introspection gem 3.3.8 has some problems. This change omits tests that are affected by these problems. 
Closes #5572 from kou/glib-unpin-gobject-introspection and squashes the following commits: 715b6d41d Unpin gobject-introspection gem Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- c_glib/Gemfile | 2 +- c_glib/test/plasma/test-plasma-client.rb | 1 + .../test/plasma/test-plasma-created-object.rb | 3 ++ .../plasma/test-plasma-referred-object.rb | 3 ++ c_glib/test/test-cuda.rb | 2 ++ ruby/red-arrow/red-arrow.gemspec | 2 +- ruby/red-plasma/test/helper.rb | 1 + ruby/red-plasma/test/helper/omittable.rb | 36 +++++++++++++++++++ ruby/red-plasma/test/test-plasma-client.rb | 3 ++ 9 files changed, 51 insertions(+), 2 deletions(-) create mode 100644 ruby/red-plasma/test/helper/omittable.rb diff --git a/c_glib/Gemfile b/c_glib/Gemfile index f42c1850a178..4b570902bcd7 100644 --- a/c_glib/Gemfile +++ b/c_glib/Gemfile @@ -20,4 +20,4 @@ source "https://rubygems.org/" gem "test-unit" -gem "gobject-introspection", "= 3.3.7" +gem "gobject-introspection" diff --git a/c_glib/test/plasma/test-plasma-client.rb b/c_glib/test/plasma/test-plasma-client.rb index a57d1fc5944e..14c65b4d65be 100644 --- a/c_glib/test/plasma/test-plasma-client.rb +++ b/c_glib/test/plasma/test-plasma-client.rb @@ -21,6 +21,7 @@ class TestPlasmaClient < Test::Unit::TestCase def setup @store = nil omit("Plasma is required") unless defined?(::Plasma) + require_gi_bindings(3, 3, 9) @store = Helper::PlasmaStore.new @store.start @options = Plasma::ClientOptions.new diff --git a/c_glib/test/plasma/test-plasma-created-object.rb b/c_glib/test/plasma/test-plasma-created-object.rb index 857322d20e14..8d036cda87a6 100644 --- a/c_glib/test/plasma/test-plasma-created-object.rb +++ b/c_glib/test/plasma/test-plasma-created-object.rb @@ -16,9 +16,12 @@ # under the License. 
class TestPlasmaCreatedObject < Test::Unit::TestCase + include Helper::Omittable + def setup @store = nil omit("Plasma is required") unless defined?(::Plasma) + require_gi_bindings(3, 3, 9) @store = Helper::PlasmaStore.new @store.start @client = Plasma::Client.new(@store.socket_path, nil) diff --git a/c_glib/test/plasma/test-plasma-referred-object.rb b/c_glib/test/plasma/test-plasma-referred-object.rb index a74641ed5dcd..ef4c72aee537 100644 --- a/c_glib/test/plasma/test-plasma-referred-object.rb +++ b/c_glib/test/plasma/test-plasma-referred-object.rb @@ -16,9 +16,12 @@ # under the License. class TestPlasmaReferredObject < Test::Unit::TestCase + include Helper::Omittable + def setup @store = nil omit("Plasma is required") unless defined?(::Plasma) + require_gi_bindings(3, 3, 9) @store = Helper::PlasmaStore.new @store.start @client = Plasma::Client.new(@store.socket_path, nil) diff --git a/c_glib/test/test-cuda.rb b/c_glib/test/test-cuda.rb index ae915307b70f..d24bab610c89 100644 --- a/c_glib/test/test-cuda.rb +++ b/c_glib/test/test-cuda.rb @@ -17,6 +17,7 @@ class TestCUDA < Test::Unit::TestCase include Helper::Buildable + include Helper::Omittable def setup omit("Arrow CUDA is required") unless defined?(::ArrowCUDA) @@ -47,6 +48,7 @@ def test_copy end def test_export + require_gi_bindings(3, 3, 9) @buffer.copy_from_host("Hello World") handle = @buffer.export serialized_handle = handle.serialize.data diff --git a/ruby/red-arrow/red-arrow.gemspec b/ruby/red-arrow/red-arrow.gemspec index 75430b5b979e..f4fcda90efe5 100644 --- a/ruby/red-arrow/red-arrow.gemspec +++ b/ruby/red-arrow/red-arrow.gemspec @@ -47,7 +47,7 @@ Gem::Specification.new do |spec| spec.extensions = ["ext/arrow/extconf.rb"] spec.add_runtime_dependency("extpp", ">= 0.0.7") - spec.add_runtime_dependency("gio2", "= 3.3.7") + spec.add_runtime_dependency("gio2", ">= 3.3.6") spec.add_runtime_dependency("native-package-installer") spec.add_runtime_dependency("pkg-config") diff --git 
a/ruby/red-plasma/test/helper.rb b/ruby/red-plasma/test/helper.rb index 255cad287004..02c545f53670 100644 --- a/ruby/red-plasma/test/helper.rb +++ b/ruby/red-plasma/test/helper.rb @@ -21,4 +21,5 @@ require "test-unit" +require_relative "helper/omittable" require_relative "helper/plasma-store" diff --git a/ruby/red-plasma/test/helper/omittable.rb b/ruby/red-plasma/test/helper/omittable.rb new file mode 100644 index 000000000000..a1c0334b63a2 --- /dev/null +++ b/ruby/red-plasma/test/helper/omittable.rb @@ -0,0 +1,36 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +module Helper + module Omittable + def require_gi_bindings(major, minor, micro) + return if GLib.check_binding_version?(major, minor, micro) + message = + "Require gobject-introspection #{major}.#{minor}.#{micro} or later: " + + GLib::BINDING_VERSION.join(".") + omit(message) + end + + def require_gi(major, minor, micro) + return if GObjectIntrospection::Version.or_later?(major, minor, micro) + message = + "Require GObject Introspection #{major}.#{minor}.#{micro} or later: " + + GObjectIntrospection::Version::STRING + omit(message) + end + end +end diff --git a/ruby/red-plasma/test/test-plasma-client.rb b/ruby/red-plasma/test/test-plasma-client.rb index de76fb9d36e8..d6182976c5fb 100644 --- a/ruby/red-plasma/test/test-plasma-client.rb +++ b/ruby/red-plasma/test/test-plasma-client.rb @@ -16,8 +16,11 @@ # under the License. class TestPlasmaClient < Test::Unit::TestCase + include Helper::Omittable + def setup @store = nil + require_gi_bindings(3, 3, 9) @store = Helper::PlasmaStore.new @store.start @id = Plasma::ObjectID.new("Hello") From 5050d87b5574afd0fae9e0e82a20ad12abfd6592 Mon Sep 17 00:00:00 2001 From: Taylor Baldwin Date: Thu, 3 Oct 2019 10:34:12 +0200 Subject: [PATCH 174/201] ARROW-6767: [JS] Lazily bind batches in scan/scanReverse MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I noticed some `TODO` comments in the JS client library that expressed interest in calling `bind(batch)` lazily. This PR implements that optimization and updates related tests. Happy to make updates per feedback! Created a JIRA issue [here](https://issues.apache.org/jira/browse/ARROW-6767). 
Closes #5565 from rolyatmax/tb/lazily-bind-batches and squashes the following commits: 8877a8f8f ARROW-6767: lazily bind batches in scan/scanReverse Authored-by: Taylor Baldwin Signed-off-by: Krisztián Szűcs --- js/src/compute/dataframe.ts | 30 ++++++++++++++++++++---------- js/test/unit/table-tests.ts | 28 ++++++++++++++-------------- 2 files changed, 34 insertions(+), 24 deletions(-) diff --git a/js/src/compute/dataframe.ts b/js/src/compute/dataframe.ts index e82e65cc1b93..db7f3d734063 100644 --- a/js/src/compute/dataframe.ts +++ b/js/src/compute/dataframe.ts @@ -132,14 +132,19 @@ export class FilteredDataFrame exte for (let batchIndex = -1; ++batchIndex < numBatches;) { // load batches const batch = batches[batchIndex]; - // TODO: bind batches lazily - // If predicate doesn't match anything in the batch we don't need - // to bind the callback - if (bind) { bind(batch); } const predicate = this._predicate.bind(batch); + let isBound = false; // yield all indices for (let index = -1, numRows = batch.length; ++index < numRows;) { - if (predicate(index, batch)) { next(index, batch); } + if (predicate(index, batch)) { + // bind batches lazily - if predicate doesn't match anything + // in the batch we don't need to call bind on the batch + if (bind && !isBound) { + bind(batch); + isBound = true; + } + next(index, batch); + } } } } @@ -149,14 +154,19 @@ export class FilteredDataFrame exte for (let batchIndex = numBatches; --batchIndex >= 0;) { // load batches const batch = batches[batchIndex]; - // TODO: bind batches lazily - // If predicate doesn't match anything in the batch we don't need - // to bind the callback - if (bind) { bind(batch); } const predicate = this._predicate.bind(batch); + let isBound = false; // yield all indices for (let index = batch.length; --index >= 0;) { - if (predicate(index, batch)) { next(index, batch); } + if (predicate(index, batch)) { + // bind batches lazily - if predicate doesn't match anything + // in the batch we don't need to call 
bind on the batch + if (bind && !isBound) { + bind(batch); + isBound = true; + } + next(index, batch); + } } } } diff --git a/js/test/unit/table-tests.ts b/js/test/unit/table-tests.ts index ae2f058e7dae..da74b32977af 100644 --- a/js/test/unit/table-tests.ts +++ b/js/test/unit/table-tests.ts @@ -424,6 +424,10 @@ describe(`Table`, () => { get_i32 = col('i32').bind(batch); })), expected: values.filter((row) => (row[F32] as number) * (row[I32] as number) > 0) + }, { + name: `filter out all records`, + filtered: table.filter(lit(1).eq(0)), + expected: [] } ]; for (let this_test of filter_tests) { @@ -440,15 +444,13 @@ describe(`Table`, () => { expect(columns.map((c) => c.get(idx))).toEqual(expected[expected_idx++]); }); }); - test(`calls bind function on every batch`, () => { - // Techincally, we only need to call bind on - // batches with data that match the predicate, so - // this test may fail in the future if we change - // that - and that's ok! + test(`calls bind function lazily`, () => { let bind = jest.fn(); filtered.scan(() => { }, bind); - for (let batch of table.chunks) { - expect(bind).toHaveBeenCalledWith(batch); + if (expected.length) { + expect(bind).toHaveBeenCalled(); + } else { + expect(bind).not.toHaveBeenCalled(); } }); }); @@ -460,15 +462,13 @@ describe(`Table`, () => { expect(columns.map((c) => c.get(idx))).toEqual(expected[--expected_idx]); }); }); - test(`calls bind function on every batch`, () => { - // Techincally, we only need to call bind on - // batches with data that match the predicate, so - // this test may fail in the future if we change - // that - and that's ok! 
+ test(`calls bind function lazily`, () => { let bind = jest.fn(); filtered.scanReverse(() => { }, bind); - for (let batch of table.chunks) { - expect(bind).toHaveBeenCalledWith(batch); + if (expected.length) { + expect(bind).toHaveBeenCalled(); + } else { + expect(bind).not.toHaveBeenCalled(); } }); }); From 1165cdb85b92cefcf59ac39d35f42d168cc64517 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 3 Oct 2019 10:37:51 +0200 Subject: [PATCH 175/201] ARROW-6686: [CI] Pull and push docker images to speed up the nightly builds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The images are available under https://hub.docker.com/u/ursalab with arrow prefix. Also resolves https://issues.apache.org/jira/browse/ARROW-2602 Closes #5485 from kszucs/docker-push-pull and squashes the following commits: 086c7b34d hadolint f1b5956f7 use arrowdev org 2f8564fad push only images built from arrow's master branch 36bcaa907 update dockerfiles 44f058575 ignore pull failures a0b7fb1fd disable master only push for now 60c40f572 jinja 9718b00a6 quote 4c7cee0a1 try to fix circle yml d7ab350c1 increase no_output_timeout; use newer machine image with docker_layer_caching c6f71a99b push from master only 16ec7f98b pull and push docker images to dockerhub to speed up the builds Authored-by: Krisztián Szűcs Signed-off-by: Krisztián Szűcs --- c_glib/Dockerfile | 2 +- dev/fuzzit/Dockerfile | 2 +- dev/lint/Dockerfile | 4 +- dev/tasks/docker-tests/circle.linux.yml | 23 +++++-- dev/tasks/tasks.yml | 90 ++++++++++++++++++++++++- docker-compose.yml | 80 +++++++++++----------- docs/Dockerfile | 2 +- integration/dask/Dockerfile | 2 +- integration/hdfs/Dockerfile | 2 +- integration/pandas/Dockerfile | 2 +- integration/spark/Dockerfile | 2 +- integration/turbodbc/Dockerfile | 2 +- python/Dockerfile | 2 +- python/Dockerfile.alpine | 2 +- python/Dockerfile.nopandas | 2 +- python/manylinux2010/README.md | 2 +- r/Dockerfile | 2 +- 
r/Dockerfile.conda | 2 +- 18 files changed, 161 insertions(+), 64 deletions(-) diff --git a/c_glib/Dockerfile b/c_glib/Dockerfile index 813695ccc7d0..32fbc79d05d6 100644 --- a/c_glib/Dockerfile +++ b/c_glib/Dockerfile @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -FROM arrow:cpp +FROM arrowdev/arrow-cpp:latest RUN apt-get update -y -q && \ apt-get -q install --no-install-recommends -y \ diff --git a/dev/fuzzit/Dockerfile b/dev/fuzzit/Dockerfile index 602cb5f254cd..447f1b844dac 100644 --- a/dev/fuzzit/Dockerfile +++ b/dev/fuzzit/Dockerfile @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -FROM arrow:cpp +FROM arrowdev/arrow-cpp:latest RUN apt-get update && \ apt-get install -y -q \ diff --git a/dev/lint/Dockerfile b/dev/lint/Dockerfile index c9658b8f1319..649120657587 100644 --- a/dev/lint/Dockerfile +++ b/dev/lint/Dockerfile @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
-FROM arrow:python-3.6 +FROM arrowdev/arrow-python-3.6 RUN apt-get update && \ apt-get install -y -q \ @@ -30,4 +30,4 @@ RUN conda install flake8 && \ ENV PATH=/opt/iwyu/bin:$PATH ADD ci/docker_install_iwyu.sh /arrow/ci/ -RUN arrow/ci/docker_install_iwyu.sh \ No newline at end of file +RUN arrow/ci/docker_install_iwyu.sh diff --git a/dev/tasks/docker-tests/circle.linux.yml b/dev/tasks/docker-tests/circle.linux.yml index 7b664eeafeda..9384a7ee1821 100644 --- a/dev/tasks/docker-tests/circle.linux.yml +++ b/dev/tasks/docker-tests/circle.linux.yml @@ -19,20 +19,29 @@ version: 2 jobs: build: machine: - image: circleci/classic:201808-01 + image: ubuntu-1604:201903-01 + docker_layer_caching: true steps: - run: docker -v - run: docker-compose -v + {% if arrow.branch == "master" %} + - run: | + if [ -n $DOCKER_USER ] && [ -n $DOCKER_PASS ]; then + docker login -u $DOCKER_USER -p $DOCKER_PASS + fi + {% endif %} - run: git clone --no-checkout {{ arrow.remote }} arrow - run: git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} - run: git -C arrow checkout FETCH_HEAD - run: git -C arrow submodule update --init --recursive - - run: | - pushd arrow - {%- for command in commands %} - {{ command }} - {%- endfor %} - popd + - run: + command: | + pushd arrow + {%- for command in commands %} + {{ command }} + {%- endfor %} + popd + no_output_timeout: "1h" workflows: version: 2 diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 804ed8f701d0..3c19451d08b8 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1158,9 +1158,12 @@ tasks: template: docker-tests/circle.linux.yml params: commands: + - docker-compose pull --ignore-pull-failures cpp + - docker-compose pull --ignore-pull-failures r - docker-compose build cpp - docker-compose build r - docker-compose run r + - docker-compose push --ignore-push-failures r docker-r-conda: ci: circle @@ -1168,9 +1171,12 @@ tasks: template: docker-tests/circle.linux.yml params: commands: + - docker-compose pull 
--ignore-pull-failures cpp + - docker-compose pull --ignore-pull-failures r-conda - docker-compose build cpp - docker-compose build r-conda - docker-compose run r-conda + - docker-compose push --ignore-push-failures r-conda docker-r-sanitizer: ci: circle @@ -1178,8 +1184,10 @@ tasks: template: docker-tests/circle.linux.yml params: commands: + - docker-compose pull --ignore-pull-failures r-sanitizer - docker-compose build r-sanitizer - docker-compose run r-sanitizer + - docker-compose push --ignore-push-failures r-sanitizer docker-rust: ci: circle @@ -1187,8 +1195,10 @@ tasks: template: docker-tests/circle.linux.yml params: commands: + - docker-compose pull --ignore-pull-failures rust - docker-compose build rust - docker-compose run rust + - docker-compose push --ignore-push-failures rust docker-cpp: ci: circle @@ -1196,8 +1206,10 @@ tasks: template: docker-tests/circle.linux.yml params: commands: + - docker-compose pull --ignore-pull-failures cpp - docker-compose build cpp - docker-compose run cpp + - docker-compose push --ignore-push-failures cpp docker-cpp-release: ci: circle @@ -1205,8 +1217,10 @@ tasks: template: docker-tests/circle.linux.yml params: commands: - - docker-compose build cpp + - docker-compose pull --ignore-pull-failures cpp-release + - docker-compose build cpp-release - docker-compose run cpp-release + - docker-compose push --ignore-push-failures cpp-release docker-cpp-alpine: ci: circle @@ -1214,8 +1228,10 @@ tasks: template: docker-tests/circle.linux.yml params: commands: + - docker-compose pull --ignore-pull-failures cpp-alpine - docker-compose build cpp-alpine - docker-compose run cpp-alpine + - docker-compose push --ignore-push-failures cpp-alpine docker-cpp-cmake32: ci: circle @@ -1223,8 +1239,10 @@ tasks: template: docker-tests/circle.linux.yml params: commands: + - docker-compose pull --ignore-pull-failures cpp-cmake32 - docker-compose build cpp-cmake32 - docker-compose run cpp-cmake32 + - docker-compose push --ignore-push-failures 
cpp-cmake32 docker-cpp-fuzzit: ci: circle @@ -1232,9 +1250,12 @@ tasks: template: docker-tests/circle.linux.yml params: commands: + - docker-compose pull --ignore-pull-failures cpp + - docker-compose pull --ignore-pull-failures fuzzit - docker-compose build cpp - docker-compose build fuzzit - docker-compose run fuzzit + - docker-compose push --ignore-push-failures fuzzit docker-cpp-static-only: ci: circle @@ -1242,8 +1263,11 @@ tasks: template: docker-tests/circle.linux.yml params: commands: + - docker-compose pull --ignore-pull-failures cpp + - docker-compose pull --ignore-pull-failures cpp-static-only - docker-compose build cpp - docker-compose run cpp-static-only + - docker-compose push --ignore-push-failures cpp-static-only docker-c_glib: ci: circle @@ -1251,9 +1275,12 @@ tasks: template: docker-tests/circle.linux.yml params: commands: + - docker-compose pull --ignore-pull-failures cpp + - docker-compose pull --ignore-pull-failures c_glib - docker-compose build cpp - docker-compose build c_glib - docker-compose run c_glib + - docker-compose push --ignore-push-failures c_glib docker-go: ci: circle @@ -1261,8 +1288,10 @@ tasks: template: docker-tests/circle.linux.yml params: commands: + - docker-compose pull --ignore-pull-failures go - docker-compose build go - docker-compose run go + - docker-compose push --ignore-push-failures go docker-js: ci: circle @@ -1270,8 +1299,10 @@ tasks: template: docker-tests/circle.linux.yml params: commands: + - docker-compose pull --ignore-pull-failures js - docker-compose build js - docker-compose run js + - docker-compose push --ignore-push-failures js docker-java: ci: circle @@ -1279,8 +1310,10 @@ tasks: template: docker-tests/circle.linux.yml params: commands: + - docker-compose pull --ignore-pull-failures java - docker-compose build java - docker-compose run java + - docker-compose push --ignore-push-failures java docker-python-2.7: ci: circle @@ -1290,9 +1323,12 @@ tasks: environment: PYTHON_VERSION: 2.7 commands: + - 
docker-compose pull --ignore-pull-failures cpp + - docker-compose pull --ignore-pull-failures python - docker-compose build cpp - docker-compose build python - docker-compose run python + - docker-compose push --ignore-push-failures python docker-python-3.6: ci: circle @@ -1302,9 +1338,12 @@ tasks: environment: PYTHON_VERSION: 3.6 commands: + - docker-compose pull --ignore-pull-failures cpp + - docker-compose pull --ignore-pull-failures python - docker-compose build cpp - docker-compose build python - docker-compose run python + - docker-compose push --ignore-push-failures python docker-python-3.7: ci: circle @@ -1314,9 +1353,12 @@ tasks: environment: PYTHON_VERSION: 3.7 commands: + - docker-compose pull --ignore-pull-failures cpp + - docker-compose pull --ignore-pull-failures python - docker-compose build cpp - docker-compose build python - docker-compose run python + - docker-compose push --ignore-push-failures python docker-python-2.7-alpine: ci: circle @@ -1326,9 +1368,12 @@ tasks: environment: PYTHON_VERSION: 2.7 commands: + - docker-compose pull --ignore-pull-failures cpp-alpine + - docker-compose pull --ignore-pull-failures python-alpine - docker-compose build cpp-alpine - docker-compose build python-alpine - docker-compose run python-alpine + - docker-compose push --ignore-push-failures python-alpine docker-python-3.6-alpine: ci: circle @@ -1338,9 +1383,12 @@ tasks: environment: PYTHON_VERSION: 3.6 commands: + - docker-compose pull --ignore-pull-failures cpp-alpine + - docker-compose pull --ignore-pull-failures python-alpine - docker-compose build cpp-alpine - docker-compose build python-alpine - docker-compose run python-alpine + - docker-compose push --ignore-push-failures python-alpine docker-python-2.7-nopandas: ci: circle @@ -1350,10 +1398,14 @@ tasks: environment: PYTHON_VERSION: 2.7 commands: + - docker-compose pull --ignore-pull-failures cpp + - docker-compose pull --ignore-pull-failures python + - docker-compose pull --ignore-pull-failures 
python-nopandas - docker-compose build cpp - docker-compose build python - docker-compose build python-nopandas - docker-compose run python-nopandas + - docker-compose push --ignore-push-failures python-nopandas docker-python-3.6-nopandas: ci: circle @@ -1363,10 +1415,14 @@ tasks: environment: PYTHON_VERSION: 3.6 commands: + - docker-compose pull --ignore-pull-failures cpp + - docker-compose pull --ignore-pull-failures python + - docker-compose pull --ignore-pull-failures python-nopandas - docker-compose build cpp - docker-compose build python - docker-compose build python-nopandas - docker-compose run python-nopandas + - docker-compose push --ignore-push-failures python-nopandas ###################### Documentation building tests ######################### @@ -1378,10 +1434,14 @@ tasks: environment: PYTHON_VERSION: 3.6 commands: + - docker-compose pull --ignore-pull-failures cpp + - docker-compose pull --ignore-pull-failures python + - docker-compose pull --ignore-pull-failures docs - docker-compose build cpp - docker-compose build python - docker-compose build docs - docker-compose run docs + - docker-compose push --ignore-push-failures docs ############################## Linter tests ################################# @@ -1393,10 +1453,14 @@ tasks: environment: PYTHON_VERSION: 3.6 commands: + - docker-compose pull --ignore-pull-failures cpp + - docker-compose pull --ignore-pull-failures python + - docker-compose pull --ignore-pull-failures lint - docker-compose build cpp - docker-compose build python - docker-compose build lint - docker-compose run lint + - docker-compose push --ignore-push-failures lint docker-iwyu: ci: circle @@ -1406,6 +1470,9 @@ tasks: environment: PYTHON_VERSION: 3.6 commands: + - docker-compose pull --ignore-pull-failures cpp + - docker-compose pull --ignore-pull-failures python + - docker-compose pull --ignore-pull-failures lint - docker-compose build cpp - docker-compose build python - docker-compose build lint @@ -1419,6 +1486,9 @@ tasks: 
environment: PYTHON_VERSION: 3.6 commands: + - docker-compose pull --ignore-pull-failures cpp + - docker-compose pull --ignore-pull-failures python + - docker-compose pull --ignore-pull-failures lint - docker-compose build cpp - docker-compose build python - docker-compose build lint @@ -1434,10 +1504,14 @@ tasks: environment: PYTHON_VERSION: 3.6 commands: + - docker-compose pull --ignore-pull-failures cpp + - docker-compose pull --ignore-pull-failures python + - docker-compose pull --ignore-pull-failures dask-integration - docker-compose build cpp - docker-compose build python - docker-compose build dask-integration - docker-compose run dask-integration + - docker-compose push --ignore-push-failures dask-integration docker-hdfs-integration: ci: circle @@ -1447,10 +1521,14 @@ tasks: environment: PYTHON_VERSION: 3.6 commands: + - docker-compose pull --ignore-pull-failures cpp + - docker-compose pull --ignore-pull-failures python + - docker-compose pull --ignore-pull-failures hdfs-integration - docker-compose build cpp - docker-compose build python - docker-compose build hdfs-integration - docker-compose run hdfs-integration + - docker-compose push --ignore-push-failures hdfs-integration docker-spark-integration: ci: circle @@ -1460,10 +1538,14 @@ tasks: environment: PYTHON_VERSION: 3.6 commands: + - docker-compose pull --ignore-pull-failures cpp + - docker-compose pull --ignore-pull-failures python + - docker-compose pull --ignore-pull-failures spark-integration - docker-compose build cpp - docker-compose build python - docker-compose build spark-integration - docker-compose run spark-integration + - docker-compose push --ignore-push-failures spark-integration docker-turbodbc-integration: ci: circle @@ -1473,10 +1555,14 @@ tasks: environment: PYTHON_VERSION: 3.6 commands: + - docker-compose pull --ignore-pull-failures cpp + - docker-compose pull --ignore-pull-failures python + - docker-compose pull --ignore-pull-failures turbodbc-integration - docker-compose build 
cpp - docker-compose build python - docker-compose build turbodbc-integration - docker-compose run turbodbc-integration + - docker-compose push --ignore-push-failures turbodbc-integration docker-pandas-master: ci: circle @@ -1486,6 +1572,8 @@ tasks: environment: PYTHON_VERSION: 3.6 commands: + - docker-compose pull --ignore-pull-failures cpp + - docker-compose pull --ignore-pull-failures python - docker-compose build cpp - docker-compose build python - docker-compose build --no-cache pandas-master diff --git a/docker-compose.yml b/docker-compose.yml index 6ebf61646189..0ce7e38bc50d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -87,7 +87,7 @@ services: # Usage: # docker-compose build build-context # docker-compose run build-context - image: arrow:build-context + image: arrowdev/arrow-build-context:latest build: context: . dockerfile: dev/Dockerfile.context @@ -99,7 +99,7 @@ services: # docker-compose build cpp # docker-compose build c_glib # docker-compose run c_glib - image: arrow:c_glib + image: arrowdev/arrow-c_glib:latest build: context: . dockerfile: c_glib/Dockerfile @@ -109,7 +109,7 @@ services: # Usage: # docker-compose build cpp # docker-compose run cpp - image: arrow:cpp + image: arrowdev/arrow-cpp:latest shm_size: 2G build: context: . @@ -120,7 +120,7 @@ services: # Usage: # docker-compose build cpp-system-deps # docker-compose run cpp-system-deps - image: ursalab/arrow:cpp-system-deps + image: arrowdev/arrow-cpp-system-deps:latest shm_size: 2G cap_add: # LeakSanitizer and gdb requires ptrace(2) @@ -142,7 +142,7 @@ services: # Usage: # docker-compose build cpp # docker-compose run cpp-release - image: arrow:cpp + image: arrowdev/arrow-cpp-release:latest shm_size: 2G build: context: . 
@@ -156,7 +156,7 @@ services: # Usage: # docker-compose build cpp # docker-compose run cpp-static-only - image: arrow:cpp + image: arrowdev/arrow-cpp:latest shm_size: 2G environment: ARROW_BUILD_SHARED: "OFF" @@ -168,7 +168,7 @@ services: # Usage: # docker-compose build cpp # docker-compose run cpp-static-only - image: arrow:cpp + image: arrowdev/arrow-cpp:latest shm_size: 2G environment: ARROW_BUILD_STATIC: "OFF" @@ -179,7 +179,7 @@ services: # Usage: # docker-compose build cpp-cmake32 # docker-compose run cpp-cmake32 - image: arrow:cpp-cmake32 + image: arrowdev/arrow-cpp-cmake32:latest shm_size: 2G build: context: . @@ -199,7 +199,7 @@ services: # Usage: # docker-compose build cpp-alpine # docker-compose run cpp-alpine - image: arrow:cpp-alpine + image: arrowdev/arrow-cpp-alpine:latest shm_size: 2G build: context: . @@ -212,7 +212,7 @@ services: # Usage: # docker-compose build cpp-fedora # docker-compose run cpp-fedora - image: arrow:cpp-fedora + image: arrowdev/arrow-cpp-fedora:latest shm_size: 2G build: context: . @@ -225,7 +225,7 @@ services: # Usage: # docker-compose build cpp-debian-testing # docker-compose run cpp-debian-testing - image: arrow:cpp-debian-testing + image: arrowdev/arrow-cpp-debian-testing:latest shm_size: 2G build: context: . @@ -238,7 +238,7 @@ services: # Usage: # docker-compose build cpp-ubuntu-trusty # docker-compose run cpp-ubuntu-trusty - image: arrow:cpp-ubuntu-trusty + image: arrowdev/arrow-cpp-ubuntu-trusty:latest shm_size: 2G build: context: . @@ -251,7 +251,7 @@ services: # Usage: # docker-compose build cpp-ubuntu-xenial # docker-compose run cpp-ubuntu-xenial - image: arrow:cpp-ubuntu-xenial + image: arrowdev/arrow-cpp-ubuntu-xenial:latest shm_size: 2G build: context: . @@ -266,7 +266,7 @@ services: # Usage: # docker-compose build cpp-minimum-boost-version # docker-compose run cpp-minimum-boost-version - image: arrow:cpp-ubuntu-xenial + image: arrowdev/arrow-cpp-ubuntu-xenial:latest shm_size: 2G build: context: . 
@@ -281,7 +281,7 @@ services: # Usage: # docker-compose build cpp-ubuntu-bionic # docker-compose run cpp-ubuntu-bionic - image: arrow:cpp-ubuntu-bionic + image: arrowdev/arrow-cpp-ubuntu-bionic:latest shm_size: 2G build: context: . @@ -294,7 +294,7 @@ services: # Usage: # docker-compose build go # docker-compose run go - image: arrow:go + image: arrowdev/arrow-go:latest build: context: . dockerfile: go/Dockerfile @@ -304,7 +304,7 @@ services: # Usage: # docker-compose build java # docker-compose run java - image: arrow:java + image: arrowdev/arrow-java:latest build: context: . dockerfile: java/Dockerfile @@ -316,7 +316,7 @@ services: # Usage: # docker-compose build java-all-jdks # docker-compose run java-all-jdks - image: ursalab/arrow-ci-java-all-jdks:latest + image: arrowdev/arrow-java-all-jdks:latest build: context: . dockerfile: java/Dockerfile.all-jdks @@ -325,7 +325,7 @@ services: - maven-cache:/root/.m2:delegated js: - image: arrow:js + image: arrowdev/arrow-js:latest build: context: . dockerfile: js/Dockerfile @@ -336,7 +336,7 @@ services: # docker-compose build cpp # docker-compose build python # docker-compose run python - image: arrow:python-${PYTHON_VERSION:-3.6} + image: arrowdev/arrow-python-${PYTHON_VERSION:-3.6}:latest shm_size: 2G build: context: . @@ -351,7 +351,7 @@ services: # docker-compose build cpp-alpine # docker-compose build python-alpine # docker-compose run python-alpine - image: arrow:python-${PYTHON_VERSION:-3.6}-alpine + image: arrowdev/arrow-python-${PYTHON_VERSION:-3.6}-alpine:latest shm_size: 2G build: context: . @@ -367,7 +367,7 @@ services: # docker-compose build python # docker-compose build python-nopandas # docker-compose run python-nopandas - image: arrow:python-${PYTHON_VERSION:-3.6}-nopandas + image: arrowdev/arrow-python-${PYTHON_VERSION:-3.6}-nopandas:latest shm_size: 2G build: context: . 
@@ -380,7 +380,7 @@ services: # Usage: # docker-compose build rust # docker-compose run rust - image: arrow:rust + image: arrowdev/arrow-rust:latest build: context: . dockerfile: rust/Dockerfile @@ -393,7 +393,7 @@ services: # docker-compose build cpp # docker-compose build r # docker-compose run r - image: arrow:r + image: arrowdev/arrow-r:latest build: context: . dockerfile: r/Dockerfile @@ -403,7 +403,7 @@ services: # Usage: # docker-compose build r-sanitizer # docker-compose run r-sanitizer - image: arrow:r-sanitizer + image: arrowdev/arrow-r-sanitizer:latest cap_add: # LeakSanitizer and gdb requires ptrace(2) - SYS_PTRACE @@ -418,7 +418,7 @@ services: # docker-compose build cpp # docker-compose build r-conda # docker-compose run r-conda - image: arrow:r-conda-${R_VERSION:-3.5.1} + image: arrowdev/arrow-r-conda-${R_VERSION:-3.5.1}:latest shm_size: 2G build: context: . @@ -439,7 +439,7 @@ services: # docker-compose build python # docker-compose build lint # docker-compose run lint - image: arrow:lint + image: arrowdev/arrow-lint:latest build: context: . dockerfile: dev/lint/Dockerfile @@ -452,7 +452,7 @@ services: # docker-compose build cpp # docker-compose build fuzzit # docker-compose run fuzzit - image: arrow:fuzzit + image: arrowdev/arrow-fuzzit:latest build: context: . 
dockerfile: dev/fuzzit/Dockerfile @@ -503,7 +503,7 @@ services: # docker-compose build python # docker-compose build lint # docker-compose run iwyu - image: arrow:lint + image: arrowdev/arrow-lint:latest command: arrow/dev/lint/run_iwyu.sh volumes: *ubuntu-volumes @@ -513,7 +513,7 @@ services: # docker-compose build python # docker-compose build lint # docker-compose run clang-format - image: arrow:lint + image: arrowdev/arrow-lint:latest command: arrow/dev/lint/run_clang_format.sh volumes: *ubuntu-volumes @@ -523,7 +523,7 @@ services: # docker-compose build python # docker-compose build lint # docker-compose run clang-tidy - image: arrow:lint + image: arrowdev/arrow-lint:latest command: arrow/dev/lint/run_clang_tidy.sh volumes: *ubuntu-volumes @@ -533,7 +533,7 @@ services: # docker-compose build python # docker-compose build docs # docker-compose run docs - image: arrow:docs + image: arrowdev/arrow-docs:latest build: context: . dockerfile: docs/Dockerfile @@ -549,7 +549,7 @@ services: # $ docker-compose pull python-manylinux1 # an then run: # $ docker-compose run -e PYTHON_VERSION=3.7 python-manylinux1 - image: ursalab/arrow_manylinux1_x86_64_base:0.14.1-static-zlib + image: arrowdev/arrow_manylinux1_x86_64_base:0.14.1-static-zlib build: context: python/manylinux1 dockerfile: Dockerfile-x86_64_base @@ -571,7 +571,7 @@ services: # $ docker-compose pull python-manylinux2010 # an then run: # $ docker-compose run -e PYTHON_VERSION=3.7 python-manylinux2010 - image: ursalab/arrow_manylinux2010_x86_64_base:0.14.1-static-zlib + image: arrowdev/arrow_manylinux2010_x86_64_base:0.14.1-static-zlib build: context: python/manylinux2010 dockerfile: Dockerfile-x86_64_base @@ -600,7 +600,7 @@ services: # docker-compose build python # docker-compose build --no-cache pandas-master # docker-compose run pandas-master - image: arrow:pandas-master + image: arrowdev/arrow-pandas-master:latest build: context: . 
dockerfile: integration/pandas/Dockerfile @@ -643,7 +643,7 @@ services: # docker-compose build python # docker-compose build hdfs-integration # docker-compose run hdfs-integration - image: arrow:hdfs-${HDFS_VERSION:-2.9.2} + image: arrowdev/arrow-hdfs-${HDFS_VERSION:-2.9.2}:latest links: - hdfs-namenode:hdfs-namenode - hdfs-datanode-1:hdfs-datanode-1 @@ -667,7 +667,7 @@ services: # docker-compose build python # docker-compose build dask-integration # docker-compose run dask-integration - image: arrow:dask-integration + image: arrowdev/arrow-dask-integration:latest build: context: . dockerfile: integration/dask/Dockerfile @@ -680,7 +680,7 @@ services: # docker-compose build python # docker-compose build turbodbc-integration # docker-compose run turbodbc-integration - image: arrow:turbodbc-integration + image: arrowdev/arrow-turbodbc-integration:latest build: context: . dockerfile: integration/turbodbc/Dockerfile @@ -693,7 +693,7 @@ services: # docker-compose build python # docker-compose build spark-integration # docker-compose run spark-integration - image: arrow:spark-${SPARK_VERSION:-master} + image: arrowdev/arrow-spark-${SPARK_VERSION:-master}:latest environment: - SPARK_VERSION=${SPARK_VERSION:-master} build: @@ -710,7 +710,7 @@ services: ################################# Release ################################### release-source: - image: arrow:release-source + image: arrowdev/arrow-release-source:latest build: context: . dockerfile: dev/release/source/Dockerfile diff --git a/docs/Dockerfile b/docs/Dockerfile index 8c2d00b99e79..016d334f7b49 100644 --- a/docs/Dockerfile +++ b/docs/Dockerfile @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
-FROM arrow:python-3.6 +FROM arrowdev/arrow-python-3.6:latest COPY ci/conda_env_sphinx.yml /arrow/ci/ RUN conda install --file arrow/ci/conda_env_sphinx.yml && \ diff --git a/integration/dask/Dockerfile b/integration/dask/Dockerfile index ceadb3ef8b77..a67c99072c49 100644 --- a/integration/dask/Dockerfile +++ b/integration/dask/Dockerfile @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -FROM arrow:python-3.6 +FROM arrowdev/arrow-python-3.6:latest # install dask release from conda RUN conda install -c conda-forge dask pytest=3 && \ diff --git a/integration/hdfs/Dockerfile b/integration/hdfs/Dockerfile index f173b39e0bae..71db503436d0 100644 --- a/integration/hdfs/Dockerfile +++ b/integration/hdfs/Dockerfile @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -FROM arrow:python-3.6 +FROM arrowdev/arrow-python-3.6:latest # installing libhdfs (JNI) ARG HADOOP_VERSION=2.9.2 diff --git a/integration/pandas/Dockerfile b/integration/pandas/Dockerfile index 8b56edf85d03..a1ae23e63321 100644 --- a/integration/pandas/Dockerfile +++ b/integration/pandas/Dockerfile @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -FROM arrow:python-3.6 +FROM arrowdev/arrow-python-3.6:latest # test against pandas' master RUN pip install git+https://github.com/pandas-dev/pandas.git diff --git a/integration/spark/Dockerfile b/integration/spark/Dockerfile index 6b032034986b..a0e7185d6da3 100644 --- a/integration/spark/Dockerfile +++ b/integration/spark/Dockerfile @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-FROM arrow:python-3.6 +FROM arrowdev/arrow-python-3.6:latest # installing java and maven ARG MAVEN_VERSION=3.6.2 diff --git a/integration/turbodbc/Dockerfile b/integration/turbodbc/Dockerfile index 26df8ad709d7..d143ea748c99 100644 --- a/integration/turbodbc/Dockerfile +++ b/integration/turbodbc/Dockerfile @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -FROM arrow:python-3.6 +FROM arrowdev/arrow-python-3.6:latest RUN export DEBIAN_FRONTEND=noninteractive && \ apt-get update -y -q && \ diff --git a/python/Dockerfile b/python/Dockerfile index 5fcadeb2fb66..71bc2b2d4ee1 100644 --- a/python/Dockerfile +++ b/python/Dockerfile @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -FROM arrow:cpp +FROM arrowdev/arrow-cpp:latest # install python specific packages ARG PYTHON_VERSION=3.6 diff --git a/python/Dockerfile.alpine b/python/Dockerfile.alpine index ed7b2d2a13db..87a1dff67acc 100644 --- a/python/Dockerfile.alpine +++ b/python/Dockerfile.alpine @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -FROM arrow:cpp-alpine +FROM arrowdev/arrow-cpp-alpine:latest SHELL ["/bin/bash", "-o", "pipefail", "-c"] diff --git a/python/Dockerfile.nopandas b/python/Dockerfile.nopandas index 58aaeed5b1ab..2faad084488a 100644 --- a/python/Dockerfile.nopandas +++ b/python/Dockerfile.nopandas @@ -16,7 +16,7 @@ # under the License. ARG PYTHON_VERSION=3.6 -FROM arrow:python-$PYTHON_VERSION +FROM arrowdev/arrow-python-$PYTHON_VERSION:latest # not installing pandas doesn't mean that it's not grabbed as a transitive # dependency, so We remove it explicitly to make sure We don't have pandas diff --git a/python/manylinux2010/README.md b/python/manylinux2010/README.md index fe2888e89df8..3a9eee750aa1 100644 --- a/python/manylinux2010/README.md +++ b/python/manylinux2010/README.md @@ -70,7 +70,7 @@ publish a build image that you built locally. 
```bash $ docker push python-manylinux2010 -The push refers to repository [ursalab/arrow_manylinux2010_x86_64_base] +The push refers to repository [arrowdev/arrow_manylinux2010_x86_64_base] a1ab88d27acc: Pushing [==============> ] 492.5MB/1.645GB [... etc. ...] ``` diff --git a/r/Dockerfile b/r/Dockerfile index 97aa64b95234..63a200b8432e 100644 --- a/r/Dockerfile +++ b/r/Dockerfile @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -FROM arrow:cpp +FROM arrowdev/arrow-cpp:latest # Configure ENV CC=gcc \ diff --git a/r/Dockerfile.conda b/r/Dockerfile.conda index 15279916e2e4..f4ab0d2974aa 100644 --- a/r/Dockerfile.conda +++ b/r/Dockerfile.conda @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -FROM arrow:cpp +FROM arrowdev/arrow-cpp:latest # install R specific packages ARG R_VERSION=3.6.1 From 560a5973aa48a24153504e8c2465c5c6998c4844 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 3 Oct 2019 10:48:27 +0200 Subject: [PATCH 176/201] ARROW-6770: [CI][Travis] Download Minio quietly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #5568 from kszucs/ARROW-6770 and squashes the following commits: ef4fd495f install minio silently Authored-by: Krisztián Szűcs Signed-off-by: Krisztián Szűcs --- ci/travis_install_minio.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/travis_install_minio.sh b/ci/travis_install_minio.sh index 5459bb8bfc4a..d3083ede18c0 100755 --- a/ci/travis_install_minio.sh +++ b/ci/travis_install_minio.sh @@ -30,6 +30,6 @@ if [ "$ARROW_TRAVIS_S3" == "1" ]; then S3FS_DIR=~/.local/bin/ mkdir -p $S3FS_DIR - wget --directory-prefix $S3FS_DIR $MINIO_URL + wget --quiet --directory-prefix $S3FS_DIR $MINIO_URL chmod +x $S3FS_DIR/minio -fi \ No newline at end of file +fi From f2e8f85a4fdee8d5e70e95b4b37787d3956bf907 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: 
Thu, 3 Oct 2019 07:58:38 -0400 Subject: [PATCH 177/201] ARROW-6773: [C++] Fix filter kernel when filtering with a boolean Array slice Closes #5570 from nealrichardson/fix-filter-slice and squashes the following commits: 3a772fcc5 Lint again f7e9475b7 lint 96220d179 Test and fix for bug in filter kernel with a slice Authored-by: Neal Richardson Signed-off-by: Benjamin Kietzman --- cpp/src/arrow/compute/kernels/filter.cc | 4 +--- cpp/src/arrow/compute/kernels/filter_test.cc | 4 ++++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/filter.cc b/cpp/src/arrow/compute/kernels/filter.cc index f985886973bd..86545aa1c61d 100644 --- a/cpp/src/arrow/compute/kernels/filter.cc +++ b/cpp/src/arrow/compute/kernels/filter.cc @@ -65,10 +65,8 @@ class FilterIndexSequence { // TODO(bkietz) this can be optimized static int64_t OutputSize(const BooleanArray& filter) { - auto offset = filter.offset(); - auto length = filter.length(); int64_t size = 0; - for (auto i = offset; i < offset + length; ++i) { + for (auto i = 0; i < filter.length(); ++i) { if (filter.IsNull(i) || filter.Value(i)) { ++size; } diff --git a/cpp/src/arrow/compute/kernels/filter_test.cc b/cpp/src/arrow/compute/kernels/filter_test.cc index 37f609e92d5c..bb685f474dc6 100644 --- a/cpp/src/arrow/compute/kernels/filter_test.cc +++ b/cpp/src/arrow/compute/kernels/filter_test.cc @@ -151,6 +151,10 @@ TYPED_TEST(TestFilterKernelWithNumeric, FilterNumeric) { this->AssertFilter("[7, 8, 9]", "[null, 1, 0]", "[null, 8]"); this->AssertFilter("[7, 8, 9]", "[1, null, 1]", "[7, null, 9]"); + this->AssertFilterArrays(ArrayFromJSON(this->type_singleton(), "[7, 8, 9]"), + ArrayFromJSON(boolean(), "[0, 1, 1, 1, 0, 1]")->Slice(3, 3), + ArrayFromJSON(this->type_singleton(), "[7, 9]")); + std::shared_ptr arr; ASSERT_RAISES(Invalid, this->Filter(this->type_singleton(), "[7, 8, 9]", "[]", &arr)); } From 31a3259c50fb1e6643695923deccd88dd664a40b Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 
3 Oct 2019 16:57:05 +0200 Subject: [PATCH 178/201] ARROW-6762: [C++] Support reading JSON files with no newline at end Also fix some lifetime issues in parallel mode, and add tests. Closes #5564 from pitrou/ARROW-6762-json-parser-trailing-newline and squashes the following commits: 562783d33 ARROW-6762: Support reading JSON files with no newline at end Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/json/chunked_builder.cc | 55 +++++++++-------- cpp/src/arrow/json/chunked_builder.h | 2 +- cpp/src/arrow/json/chunker.cc | 75 ++++++++++++++++++----- cpp/src/arrow/json/chunker.h | 33 ++++++++-- cpp/src/arrow/json/chunker_test.cc | 88 ++++++++++++++++++++++++--- cpp/src/arrow/json/options.h | 3 +- cpp/src/arrow/json/reader.cc | 64 +++++++++++-------- cpp/src/arrow/json/reader_test.cc | 16 +++++ python/pyarrow/tests/test_json.py | 83 +++++++++++++++++++++++++ 9 files changed, 338 insertions(+), 81 deletions(-) diff --git a/cpp/src/arrow/json/chunked_builder.cc b/cpp/src/arrow/json/chunked_builder.cc index f7f58e59a2a7..8c9206168db2 100644 --- a/cpp/src/arrow/json/chunked_builder.cc +++ b/cpp/src/arrow/json/chunked_builder.cc @@ -26,7 +26,6 @@ #include "arrow/json/converter.h" #include "arrow/table.h" #include "arrow/util/logging.h" -#include "arrow/util/stl.h" #include "arrow/util/task_group.h" namespace arrow { @@ -60,7 +59,9 @@ class NonNestedChunkedArrayBuilder : public ChunkedArrayBuilder { std::shared_ptr converter_; }; -class TypedChunkedArrayBuilder : public NonNestedChunkedArrayBuilder { +class TypedChunkedArrayBuilder + : public NonNestedChunkedArrayBuilder, + public std::enable_shared_from_this { public: using NonNestedChunkedArrayBuilder::NonNestedChunkedArrayBuilder; @@ -72,17 +73,21 @@ class TypedChunkedArrayBuilder : public NonNestedChunkedArrayBuilder { } lock.unlock(); - task_group_->Append([this, block_index, unconverted] { + auto self = shared_from_this(); + + task_group_->Append([self, block_index, unconverted] { 
std::shared_ptr converted; - RETURN_NOT_OK(converter_->Convert(unconverted, &converted)); - std::unique_lock lock(mutex_); - chunks_[block_index] = std::move(converted); + RETURN_NOT_OK(self->converter_->Convert(unconverted, &converted)); + std::unique_lock lock(self->mutex_); + self->chunks_[block_index] = std::move(converted); return Status::OK(); }); } }; -class InferringChunkedArrayBuilder : public NonNestedChunkedArrayBuilder { +class InferringChunkedArrayBuilder + : public NonNestedChunkedArrayBuilder, + public std::enable_shared_from_this { public: InferringChunkedArrayBuilder(const std::shared_ptr& task_group, const PromotionGraph* promotion_graph, @@ -105,8 +110,9 @@ class InferringChunkedArrayBuilder : public NonNestedChunkedArrayBuilder { } void ScheduleConvertChunk(int64_t block_index) { - task_group_->Append([this, block_index] { - return TryConvertChunk(static_cast(block_index)); + auto self = shared_from_this(); + task_group_->Append([self, block_index] { + return self->TryConvertChunk(static_cast(block_index)); }); } @@ -173,7 +179,7 @@ class InferringChunkedArrayBuilder : public NonNestedChunkedArrayBuilder { class ChunkedListArrayBuilder : public ChunkedArrayBuilder { public: ChunkedListArrayBuilder(const std::shared_ptr& task_group, MemoryPool* pool, - std::unique_ptr value_builder, + std::shared_ptr value_builder, const std::shared_ptr& value_field) : ChunkedArrayBuilder(task_group), pool_(pool), @@ -250,7 +256,7 @@ class ChunkedListArrayBuilder : public ChunkedArrayBuilder { std::mutex mutex_; MemoryPool* pool_; - std::unique_ptr value_builder_; + std::shared_ptr value_builder_; BufferVector offset_chunks_, null_bitmap_chunks_; std::shared_ptr value_field_; }; @@ -260,7 +266,7 @@ class ChunkedStructArrayBuilder : public ChunkedArrayBuilder { ChunkedStructArrayBuilder( const std::shared_ptr& task_group, MemoryPool* pool, const PromotionGraph* promotion_graph, - std::vector>> + std::vector>> name_builders) : ChunkedArrayBuilder(task_group), 
pool_(pool), promotion_graph_(promotion_graph) { for (auto&& name_builder : name_builders) { @@ -390,7 +396,7 @@ class ChunkedStructArrayBuilder : public ChunkedArrayBuilder { auto new_index = static_cast(name_to_index_.size()); it = name_to_index_.emplace(fields[i]->name(), new_index).first; - std::unique_ptr child_builder; + std::shared_ptr child_builder; RETURN_NOT_OK(MakeChunkedArrayBuilder(task_group_, pool_, promotion_graph_, type, &child_builder)); child_builders_.emplace_back(std::move(child_builder)); @@ -411,7 +417,7 @@ class ChunkedStructArrayBuilder : public ChunkedArrayBuilder { MemoryPool* pool_; const PromotionGraph* promotion_graph_; std::unordered_map name_to_index_; - std::vector> child_builders_; + std::vector> child_builders_; std::vector> child_absent_; BufferVector null_bitmap_chunks_; std::vector chunk_lengths_; @@ -420,37 +426,36 @@ class ChunkedStructArrayBuilder : public ChunkedArrayBuilder { Status MakeChunkedArrayBuilder(const std::shared_ptr& task_group, MemoryPool* pool, const PromotionGraph* promotion_graph, const std::shared_ptr& type, - std::unique_ptr* out) { + std::shared_ptr* out) { if (type->id() == Type::STRUCT) { - std::vector>> + std::vector>> child_builders; for (const auto& f : type->children()) { - std::unique_ptr child_builder; + std::shared_ptr child_builder; RETURN_NOT_OK(MakeChunkedArrayBuilder(task_group, pool, promotion_graph, f->type(), &child_builder)); child_builders.emplace_back(f->name(), std::move(child_builder)); } - *out = internal::make_unique( - task_group, pool, promotion_graph, std::move(child_builders)); + *out = std::make_shared(task_group, pool, promotion_graph, + std::move(child_builders)); return Status::OK(); } if (type->id() == Type::LIST) { auto list_type = static_cast(type.get()); - std::unique_ptr value_builder; + std::shared_ptr value_builder; RETURN_NOT_OK(MakeChunkedArrayBuilder(task_group, pool, promotion_graph, list_type->value_type(), &value_builder)); - *out = internal::make_unique( + 
*out = std::make_shared( task_group, pool, std::move(value_builder), list_type->value_field()); return Status::OK(); } std::shared_ptr converter; RETURN_NOT_OK(MakeConverter(type, pool, &converter)); if (promotion_graph) { - *out = internal::make_unique( - task_group, promotion_graph, std::move(converter)); + *out = std::make_shared(task_group, promotion_graph, + std::move(converter)); } else { - *out = - internal::make_unique(task_group, std::move(converter)); + *out = std::make_shared(task_group, std::move(converter)); } return Status::OK(); } diff --git a/cpp/src/arrow/json/chunked_builder.h b/cpp/src/arrow/json/chunked_builder.h index b2cfbefdf45b..f872c723d78c 100644 --- a/cpp/src/arrow/json/chunked_builder.h +++ b/cpp/src/arrow/json/chunked_builder.h @@ -70,7 +70,7 @@ class ARROW_EXPORT ChunkedArrayBuilder { ARROW_EXPORT Status MakeChunkedArrayBuilder( const std::shared_ptr& task_group, MemoryPool* pool, const PromotionGraph* promotion_graph, const std::shared_ptr& type, - std::unique_ptr* out); + std::shared_ptr* out); } // namespace json } // namespace arrow diff --git a/cpp/src/arrow/json/chunker.cc b/cpp/src/arrow/json/chunker.cc index 5ec45d787030..cd21ca10c740 100644 --- a/cpp/src/arrow/json/chunker.cc +++ b/cpp/src/arrow/json/chunker.cc @@ -38,7 +38,8 @@ using internal::make_unique; using util::string_view; static Status StraddlingTooLarge() { - return Status::Invalid("straddling object straddles two block boundaries"); + return Status::Invalid( + "straddling object straddles two block boundaries (try to increase block size?)"); } static size_t ConsumeWhitespace(std::shared_ptr* buf) { @@ -58,9 +59,11 @@ static size_t ConsumeWhitespace(std::shared_ptr* buf) { #endif } +// A chunker implementation that assumes JSON objects don't contain raw newlines. +// This allows fast chunk delimitation using a simple newline search. 
class NewlinesStrictlyDelimitChunker : public Chunker { public: - Status Process(const std::shared_ptr& block, std::shared_ptr* whole, + Status Process(std::shared_ptr block, std::shared_ptr* whole, std::shared_ptr* partial) override { auto last_newline = string_view(*block).find_last_of("\n\r"); if (last_newline == string_view::npos) { @@ -74,11 +77,24 @@ class NewlinesStrictlyDelimitChunker : public Chunker { return Status::OK(); } - Status ProcessWithPartial(const std::shared_ptr& partial_original, - const std::shared_ptr& block, + Status ProcessWithPartial(std::shared_ptr partial_original, + std::shared_ptr block, std::shared_ptr* completion, std::shared_ptr* rest) override { - auto partial = partial_original; + return DoProcessWithPartial(partial_original, block, false, completion, rest); + } + + Status ProcessFinal(std::shared_ptr partial_original, + std::shared_ptr block, std::shared_ptr* completion, + std::shared_ptr* rest) override { + return DoProcessWithPartial(partial_original, block, true, completion, rest); + } + + protected: + Status DoProcessWithPartial(std::shared_ptr partial, + std::shared_ptr block, bool is_final, + std::shared_ptr* completion, + std::shared_ptr* rest) { ConsumeWhitespace(&partial); if (partial->size() == 0) { // if partial is empty, don't bother looking for completion @@ -88,9 +104,16 @@ class NewlinesStrictlyDelimitChunker : public Chunker { } auto first_newline = string_view(*block).find_first_of("\n\r"); if (first_newline == string_view::npos) { - // no newlines in this block; straddling object straddles *two* block boundaries. 
- // retry with larger buffer - return StraddlingTooLarge(); + // no newlines in this block + if (is_final) { + // => it's entirely a completion of partial + *completion = block; + *rest = SliceBuffer(block, 0, 0); + return Status::OK(); + } else { + // => the current object is too large for block size + return StraddlingTooLarge(); + } } *completion = SliceBuffer(block, 0, first_newline + 1); *rest = SliceBuffer(block, first_newline + 1); @@ -164,9 +187,11 @@ static size_t ConsumeWholeObject(Stream&& stream) { } } +// A chunker implementation that assumes JSON objects can contain raw newlines, +// and uses actual JSON parsing to delimit chunks. class ParsingChunker : public Chunker { public: - Status Process(const std::shared_ptr& block, std::shared_ptr* whole, + Status Process(std::shared_ptr block, std::shared_ptr* whole, std::shared_ptr* partial) override { if (block->size() == 0) { *whole = SliceBuffer(block, 0, 0); @@ -194,11 +219,24 @@ class ParsingChunker : public Chunker { return Status::OK(); } - Status ProcessWithPartial(const std::shared_ptr& partial_original, - const std::shared_ptr& block, + Status ProcessWithPartial(std::shared_ptr partial_original, + std::shared_ptr block, std::shared_ptr* completion, std::shared_ptr* rest) override { - auto partial = partial_original; + return DoProcessWithPartial(partial_original, block, false, completion, rest); + } + + Status ProcessFinal(std::shared_ptr partial_original, + std::shared_ptr block, std::shared_ptr* completion, + std::shared_ptr* rest) override { + return DoProcessWithPartial(partial_original, block, true, completion, rest); + } + + protected: + Status DoProcessWithPartial(std::shared_ptr partial, + std::shared_ptr block, bool is_final, + std::shared_ptr* completion, + std::shared_ptr* rest) { ConsumeWhitespace(&partial); if (partial->size() == 0) { // if partial is empty, don't bother looking for completion @@ -208,9 +246,16 @@ class ParsingChunker : public Chunker { } auto length = 
ConsumeWholeObject(MultiStringStream({partial, block})); if (length == string_view::npos) { - // straddling object straddles *two* block boundaries. - // retry with larger buffer - return StraddlingTooLarge(); + // no whole object could be consumed from partial followed by block + if (is_final) { + // => it's entirely a completion of partial + *completion = block; + *rest = SliceBuffer(block, 0, 0); + return Status::OK(); + } else { + // => the current object is too large for block size + return StraddlingTooLarge(); + } } auto completion_length = length - partial->size(); *completion = SliceBuffer(block, 0, completion_length); diff --git a/cpp/src/arrow/json/chunker.h b/cpp/src/arrow/json/chunker.h index 0f94d81afd3d..7df1b60a4e73 100644 --- a/cpp/src/arrow/json/chunker.h +++ b/cpp/src/arrow/json/chunker.h @@ -41,23 +41,48 @@ class ARROW_EXPORT Chunker { virtual ~Chunker() = default; /// \brief Carve up a chunk in a block of data to contain only whole objects + /// + /// Post-conditions: + /// - block == whole + partial + /// - `whole` is a valid block of JSON data + /// - `partial` doesn't contain an entire JSON object + /// /// \param[in] block json data to be chunked /// \param[out] whole subrange of block containing whole json objects /// \param[out] partial subrange of block a partial json object - virtual Status Process(const std::shared_ptr& block, - std::shared_ptr* whole, + virtual Status Process(std::shared_ptr block, std::shared_ptr* whole, std::shared_ptr* partial) = 0; /// \brief Carve the completion of a partial object out of a block + /// + /// Post-conditions: + /// - block == completion + rest + /// - `partial + completion` is a valid block of JSON data + /// - `completion` doesn't contain an entire JSON object + /// /// \param[in] partial incomplete json object /// \param[in] block json data /// \param[out] completion subrange of block containing the completion of partial /// \param[out] rest subrange of block containing what completion does not cover - virtual Status
ProcessWithPartial(const std::shared_ptr& partial, - const std::shared_ptr& block, + virtual Status ProcessWithPartial(std::shared_ptr partial, + std::shared_ptr block, std::shared_ptr* completion, std::shared_ptr* rest) = 0; + /// \brief Like ProcessWithPartial, but for the last block of a file + /// + /// This method allows for a final JSON object without a trailing newline + /// (ProcessWithPartial would return an error in that case). + /// + /// Post-conditions: + /// - block == completion + rest + /// - `partial + completion` is a valid block of JSON data + /// - `completion` doesn't contain an entire JSON object + virtual Status ProcessFinal(std::shared_ptr partial, + std::shared_ptr block, + std::shared_ptr* completion, + std::shared_ptr* rest) = 0; + static std::unique_ptr Make(const ParseOptions& options); protected: diff --git a/cpp/src/arrow/json/chunker_test.cc b/cpp/src/arrow/json/chunker_test.cc index fbe5c0077131..70bca68426cc 100644 --- a/cpp/src/arrow/json/chunker_test.cc +++ b/cpp/src/arrow/json/chunker_test.cc @@ -27,6 +27,7 @@ #include "arrow/json/chunker.h" #include "arrow/json/test_common.h" #include "arrow/testing/gtest_util.h" +#include "arrow/util/logging.h" #include "arrow/util/string_view.h" namespace arrow { @@ -39,7 +40,8 @@ namespace json { using util::string_view; template -static std::shared_ptr join(Lines&& lines, std::string delimiter) { +static std::shared_ptr join(Lines&& lines, std::string delimiter, + bool delimiter_at_end = true) { std::shared_ptr joined; BufferVector line_buffers; auto delimiter_buffer = std::make_shared(delimiter); @@ -47,6 +49,9 @@ static std::shared_ptr join(Lines&& lines, std::string delimiter) { line_buffers.push_back(std::make_shared(line)); line_buffers.push_back(delimiter_buffer); } + if (!delimiter_at_end) { + line_buffers.pop_back(); + } ABORT_NOT_OK(ConcatenateBuffers(line_buffers, default_memory_pool(), &joined)); return joined; } @@ -75,15 +80,23 @@ static std::size_t
ConsumeWholeObject(std::shared_ptr* buf) { return length; } +void AssertOnlyWholeObjects(Chunker& chunker, std::shared_ptr whole, int* count) { + *count = 0; + while (whole && !WhitespaceOnly(whole)) { + auto buf = whole; + if (ConsumeWholeObject(&whole) == string_view::npos) { + FAIL() << "Not a whole JSON object: '" << buf->ToString() << "'"; + } + ++*count; + } +} + void AssertWholeObjects(Chunker& chunker, const std::shared_ptr& block, int expected_count) { std::shared_ptr whole, partial; ASSERT_OK(chunker.Process(block, &whole, &partial)); - int count = 0; - while (whole && !WhitespaceOnly(whole)) { - if (ConsumeWholeObject(&whole) == string_view::npos) FAIL(); - ++count; - } + int count; + AssertOnlyWholeObjects(chunker, whole, &count); ASSERT_EQ(count, expected_count); } @@ -103,6 +116,39 @@ void AssertChunking(Chunker& chunker, std::shared_ptr buf, int total_cou } } +void AssertChunkingBlockSize(Chunker& chunker, std::shared_ptr buf, + int64_t block_size, int expected_count) { + std::shared_ptr partial = Buffer::FromString({}); + int64_t pos = 0; + int total_count = 0; + while (pos < buf->size()) { + int count; + auto block = SliceBuffer(buf, pos, std::min(block_size, buf->size() - pos)); + pos += block->size(); + std::shared_ptr completion, whole, next_partial; + + if (pos == buf->size()) { + // Last block + ASSERT_OK(chunker.ProcessFinal(partial, block, &completion, &whole)); + } else { + std::shared_ptr starts_with_whole; + ASSERT_OK( + chunker.ProcessWithPartial(partial, block, &completion, &starts_with_whole)); + ASSERT_OK(chunker.Process(starts_with_whole, &whole, &next_partial)); + } + // partial + completion should be a valid JSON block + ASSERT_OK(ConcatenateBuffers({partial, completion}, default_memory_pool(), &partial)); + AssertOnlyWholeObjects(chunker, partial, &count); + total_count += count; + // whole should be a valid JSON block + AssertOnlyWholeObjects(chunker, whole, &count); + total_count += count; + partial = next_partial; + } + 
ASSERT_EQ(pos, buf->size()); + ASSERT_EQ(total_count, expected_count); +} + void AssertStraddledChunking(Chunker& chunker, const std::shared_ptr& buf) { auto first_half = SliceBuffer(buf, 0, buf->size() / 2); auto second_half = SliceBuffer(buf, buf->size() / 2); @@ -143,11 +189,18 @@ INSTANTIATE_TEST_CASE_P(NoNewlineChunkerTest, BaseChunkerTest, ::testing::Values INSTANTIATE_TEST_CASE_P(ChunkerTest, BaseChunkerTest, ::testing::Values(true)); -constexpr auto object_count = 3; +constexpr int object_count = 4; +constexpr int min_block_size = 28; + static const std::vector& lines() { - static const std::vector l = {R"({"0":"ab","1":"c","2":""})", - R"({"0":"def","1":"","2":"gh"})", - R"({"0":"","1":"ij","2":"kl"})"}; + // clang-format off + static const std::vector l = { + R"({"0":"ab","1":"c","2":""})", + R"({"0":"def","1":"","2":"gh"})", + R"({"0":null})", + R"({"0":"","1":"ij","2":"kl"})" + }; + // clang-format on return l; } @@ -155,6 +208,21 @@ TEST_P(BaseChunkerTest, Basics) { AssertChunking(*chunker_, join(lines(), "\n"), object_count); } +TEST_P(BaseChunkerTest, BlockSizes) { + auto check_block_sizes = [&](std::shared_ptr data) { + for (int64_t block_size = min_block_size; block_size < min_block_size + 30; + ++block_size) { + AssertChunkingBlockSize(*chunker_, data, block_size, object_count); + } + }; + + check_block_sizes(join(lines(), "\n")); + check_block_sizes(join(lines(), "\r\n")); + // Without ending newline + check_block_sizes(join(lines(), "\n", false)); + check_block_sizes(join(lines(), "\r\n", false)); +} + TEST_P(BaseChunkerTest, Empty) { auto empty = std::make_shared("\n"); AssertChunking(*chunker_, empty, 0); diff --git a/cpp/src/arrow/json/options.h b/cpp/src/arrow/json/options.h index f07504188038..03d46ad84304 100644 --- a/cpp/src/arrow/json/options.h +++ b/cpp/src/arrow/json/options.h @@ -46,8 +46,7 @@ struct ARROW_EXPORT ParseOptions { /// Whether objects may be printed across multiple lines (for example pretty-printed) /// - /// If true, 
parsing may be slower - /// If false, input must end with an empty line + /// If true, parsing may be slower. bool newlines_in_values = false; /// How JSON fields outside of explicit_schema (if given) are treated diff --git a/cpp/src/arrow/json/reader.cc b/cpp/src/arrow/json/reader.cc index 45f3e2e0d9c1..459c107795ec 100644 --- a/cpp/src/arrow/json/reader.cc +++ b/cpp/src/arrow/json/reader.cc @@ -47,7 +47,8 @@ using io::internal::ReadaheadSpooler; namespace json { -class TableReaderImpl : public TableReader { +class TableReaderImpl : public TableReader, + public std::enable_shared_from_this { public: TableReaderImpl(MemoryPool* pool, std::shared_ptr input, const ReadOptions& read_options, const ParseOptions& parse_options, @@ -69,30 +70,37 @@ class TableReaderImpl : public TableReader { return Status::Invalid("Empty JSON file"); } + auto self = shared_from_this(); auto empty = std::make_shared(""); int64_t block_index = 0; - for (std::shared_ptr partial = empty, completion = empty, - starts_with_whole = rh.buffer; - rh.buffer; ++block_index) { - // get completion of partial from previous block - RETURN_NOT_OK(chunker_->ProcessWithPartial(partial, rh.buffer, &completion, - &starts_with_whole)); - - // get all whole objects entirely inside the current buffer - std::shared_ptr whole, next_partial; - RETURN_NOT_OK(chunker_->Process(starts_with_whole, &whole, &next_partial)); - - // launch parse task - task_group_->Append([this, partial, completion, whole, block_index] { - return ParseAndInsert(partial, completion, whole, block_index); - }); + std::shared_ptr partial = empty; + + while (rh.buffer) { + std::shared_ptr block, whole, completion, next_partial; + block = rh.buffer; RETURN_NOT_OK(readahead_.Read(&rh)); - if (rh.buffer == nullptr) { - DCHECK_EQ(string_view(*next_partial).find_first_not_of(" \t\n\r"), - string_view::npos); + + if (!rh.buffer) { + // End of file reached => compute completion from penultimate block + RETURN_NOT_OK(chunker_->ProcessFinal(partial, 
block, &completion, &whole)); + } else { + std::shared_ptr starts_with_whole; + // Get completion of partial from previous block. + RETURN_NOT_OK(chunker_->ProcessWithPartial(partial, block, &completion, + &starts_with_whole)); + + // Get all whole objects entirely inside the current buffer + RETURN_NOT_OK(chunker_->Process(starts_with_whole, &whole, &next_partial)); } + + // Launch parse task + task_group_->Append([self, partial, completion, whole, block_index] { + return self->ParseAndInsert(partial, completion, whole, block_index); + }); + block_index++; + partial = next_partial; } @@ -123,13 +131,21 @@ class TableReaderImpl : public TableReader { RETURN_NOT_OK(parser->ReserveScalarStorage(partial->size() + completion->size() + whole->size())); - if (completion->size() != 0) { + if (partial->size() != 0 || completion->size() != 0) { std::shared_ptr straddling; - RETURN_NOT_OK(ConcatenateBuffers({partial, completion}, pool_, &straddling)); + if (partial->size() == 0) { + straddling = completion; + } else if (completion->size() == 0) { + straddling = partial; + } else { + RETURN_NOT_OK(ConcatenateBuffers({partial, completion}, pool_, &straddling)); + } RETURN_NOT_OK(parser->Parse(straddling)); } - RETURN_NOT_OK(parser->Parse(whole)); + if (whole->size() != 0) { + RETURN_NOT_OK(parser->Parse(whole)); + } std::shared_ptr parsed; RETURN_NOT_OK(parser->Finish(&parsed)); @@ -143,7 +159,7 @@ class TableReaderImpl : public TableReader { std::unique_ptr chunker_; std::shared_ptr task_group_; ReadaheadSpooler readahead_; - std::unique_ptr builder_; + std::shared_ptr builder_; }; Status TableReader::Make(MemoryPool* pool, std::shared_ptr input, @@ -174,7 +190,7 @@ Status ParseOne(ParseOptions options, std::shared_ptr json, options.unexpected_field_behavior == UnexpectedFieldBehavior::InferType ? 
GetPromotionGraph() : nullptr; - std::unique_ptr builder; + std::shared_ptr builder; RETURN_NOT_OK(MakeChunkedArrayBuilder(internal::TaskGroup::MakeSerial(), default_memory_pool(), promotion_graph, type, &builder)); diff --git a/cpp/src/arrow/json/reader_test.cc b/cpp/src/arrow/json/reader_test.cc index 016f49a21a07..82f2c8671e02 100644 --- a/cpp/src/arrow/json/reader_test.cc +++ b/cpp/src/arrow/json/reader_test.cc @@ -92,6 +92,22 @@ TEST_P(ReaderTest, Empty) { AssertTablesEqual(*expected_table, *table_); } +TEST_P(ReaderTest, EmptyNoNewlineAtEnd) { + SetUpReader("{}\n{}"); + ASSERT_OK(reader_->Read(&table_)); + + auto expected_table = Table::Make(schema({}), ArrayVector(), 2); + AssertTablesEqual(*expected_table, *table_); +} + +TEST_P(ReaderTest, EmptyManyNewlines) { + SetUpReader("{}\n\r\n{}\n\r\n"); + ASSERT_OK(reader_->Read(&table_)); + + auto expected_table = Table::Make(schema({}), ArrayVector(), 2); + AssertTablesEqual(*expected_table, *table_); +} + TEST_P(ReaderTest, Basics) { parse_options_.unexpected_field_behavior = UnexpectedFieldBehavior::InferType; auto src = scalars_only_src(); diff --git a/python/pyarrow/tests/test_json.py b/python/pyarrow/tests/test_json.py index 222597869022..e571894d3e93 100644 --- a/python/pyarrow/tests/test_json.py +++ b/python/pyarrow/tests/test_json.py @@ -16,14 +16,41 @@ # under the License. import io +import itertools +import json +import string import unittest +import numpy as np import pytest import pyarrow as pa from pyarrow.json import read_json, ReadOptions, ParseOptions +def generate_col_names(): + # 'a', 'b'... 'z', then 'aa', 'ab'... 
+ letters = string.ascii_lowercase + for letter in letters: + yield letter + for first in letter: + for second in letter: + yield first + second + + +def make_random_json(num_cols=2, num_rows=10, linesep=u'\r\n'): + arr = np.random.RandomState(42).randint(0, 1000, size=(num_cols, num_rows)) + col_names = list(itertools.islice(generate_col_names(), num_cols)) + lines = [] + for row in arr.T: + json_obj = {k: int(v) for (k, v) in zip(col_names, row)} + lines.append(json.dumps(json_obj)) + data = linesep.join(lines).encode() + columns = [pa.array(col, type=pa.int64()) for col in arr] + expected = pa.Table.from_arrays(columns, col_names) + return data, expected + + def test_read_options(): cls = ReadOptions opts = cls() @@ -75,6 +102,37 @@ def test_file_object(self): with pytest.raises(TypeError): self.read_json(sio) + def test_block_sizes(self): + rows = b'{"a": 1}\n{"a": 2}\n{"a": 3}' + read_options = ReadOptions() + parse_options = ParseOptions() + + for data in [rows, rows + b'\n']: + for newlines_in_values in [False, True]: + parse_options.newlines_in_values = newlines_in_values + read_options.block_size = 4 + with pytest.raises(ValueError, + match="try to increase block size"): + self.read_bytes(data, read_options=read_options, + parse_options=parse_options) + + # Validate reader behavior with various block sizes. + # There used to be bugs in this area. 
+ for block_size in range(9, 20): + read_options.block_size = block_size + table = self.read_bytes(data, read_options=read_options, + parse_options=parse_options) + assert table.to_pydict() == {'a': [1, 2, 3]} + + def test_no_newline_at_end(self): + rows = b'{"a": 1,"b": 2, "c": 3}\n{"a": 4,"b": 5, "c": 6}' + table = self.read_bytes(rows) + assert table.to_pydict() == { + 'a': [1, 4], + 'b': [2, 5], + 'c': [3, 6], + } + def test_simple_ints(self): # Infer integer columns rows = b'{"a": 1,"b": 2, "c": 3}\n{"a": 4,"b": 5, "c": 6}\n' @@ -126,6 +184,31 @@ def test_simple_nulls(self): 'e': [None, True, False], } + def test_small_random_json(self): + data, expected = make_random_json(num_cols=2, num_rows=10) + table = self.read_bytes(data) + assert table.schema == expected.schema + assert table.equals(expected) + assert table.to_pydict() == expected.to_pydict() + + def test_stress_block_sizes(self): + # Test a number of small block sizes to stress block stitching + data_base, expected = make_random_json(num_cols=2, num_rows=100) + read_options = ReadOptions() + parse_options = ParseOptions() + + for data in [data_base, data_base.rstrip(b'\r\n')]: + for newlines_in_values in [False, True]: + parse_options.newlines_in_values = newlines_in_values + for block_size in [22, 23, 37]: + read_options.block_size = block_size + table = self.read_bytes(data, read_options=read_options, + parse_options=parse_options) + assert table.schema == expected.schema + if not table.equals(expected): + # Better error output + assert table.to_pydict() == expected.to_pydict() + class TestSerialJSONRead(BaseTestJSONRead, unittest.TestCase): From a4738cf4a93cf8f415e08ac400da4e67273b5da6 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 3 Oct 2019 18:37:33 +0200 Subject: [PATCH 179/201] ARROW-6613: [C++] Minimize usage of boost::filesystem Allow building Arrow core (without tests, without filesystem layer) without boost dependencies. 
The `minimal_build` example produces something like this: ``` # size /usr/local/lib/libarrow.so text data bss dec hex filename 4001919 84992 1712 4088623 3e632f /usr/local/lib/libarrow.so # ldd /usr/local/lib/libarrow.so linux-vdso.so.1 (0x00007fff45f40000) libstdc++.so.6 => /usr/lib/x86_64-linux-gnu/libstdc++.so.6 (0x00007f7089b08000) libm.so.6 => /lib/x86_64-linux-gnu/libm.so.6 (0x00007f708976a000) libgcc_s.so.1 => /lib/x86_64-linux-gnu/libgcc_s.so.1 (0x00007f7089552000) libpthread.so.0 => /lib/x86_64-linux-gnu/libpthread.so.0 (0x00007f7089333000) libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f7088f42000) /lib64/ld-linux-x86-64.so.2 (0x00007f708a478000) ``` Closes #5545 from pitrou/ARROW-6613-boost-fs-optional and squashes the following commits: 4757240b6 Address review comments 6ac957778 ARROW-6613: Minimize usage of boost::filesystem Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/CMakeLists.txt | 15 +- cpp/src/arrow/CMakeLists.txt | 4 + cpp/src/arrow/io/hdfs_internal.cc | 24 +- cpp/src/arrow/ipc/json_integration_test.cc | 29 +- cpp/src/arrow/util/io_util.cc | 465 +++++++++++++++++---- cpp/src/arrow/util/io_util.h | 31 +- cpp/src/arrow/util/io_util_test.cc | 182 ++++++++ 7 files changed, 626 insertions(+), 124 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index ea1a5756587c..640d1843521c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -673,12 +673,19 @@ if(ARROW_STATIC_LINK_LIBS) add_dependencies(arrow_dependencies ${ARROW_STATIC_LINK_LIBS}) endif() -set(ARROW_SHARED_PRIVATE_LINK_LIBS ${ARROW_STATIC_LINK_LIBS} ${BOOST_FILESYSTEM_LIBRARY} - ${BOOST_SYSTEM_LIBRARY}) +set(ARROW_SHARED_PRIVATE_LINK_LIBS ${ARROW_STATIC_LINK_LIBS}) -list(APPEND ARROW_STATIC_LINK_LIBS ${BOOST_FILESYSTEM_LIBRARY} ${BOOST_SYSTEM_LIBRARY}) +# Is boost::filesystem needed? 
+if(ARROW_BUILD_INTEGRATION OR ARROW_BUILD_TESTS OR ARROW_FILESYSTEM OR ARROW_HDFS) + set(ARROW_WITH_BOOST_FILESYSTEM ON) -list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS boost_filesystem boost_system boost_regex) + list(APPEND ARROW_SHARED_PRIVATE_LINK_LIBS ${BOOST_FILESYSTEM_LIBRARY} + ${BOOST_SYSTEM_LIBRARY}) + + list(APPEND ARROW_STATIC_LINK_LIBS ${BOOST_FILESYSTEM_LIBRARY} ${BOOST_SYSTEM_LIBRARY}) + + list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS boost_filesystem boost_system) +endif() if(NOT MSVC) list(APPEND ARROW_LINK_LIBS ${CMAKE_DL_LIBS}) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index bf5a77e58857..8aaf5f549fdc 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -298,6 +298,10 @@ if(ARROW_WITH_URIPARSER) list(APPEND ARROW_SRCS util/uri.cc) endif() +if(ARROW_WITH_BOOST_FILESYSTEM) + add_definitions(-DARROW_WITH_BOOST_FILESYSTEM) +endif() + if(ARROW_WITH_BROTLI) add_definitions(-DARROW_WITH_BROTLI) list(APPEND ARROW_SRCS util/compression_brotli.cc) diff --git a/cpp/src/arrow/io/hdfs_internal.cc b/cpp/src/arrow/io/hdfs_internal.cc index ae2c4f01e0c5..0b1bd66e3bc4 100644 --- a/cpp/src/arrow/io/hdfs_internal.cc +++ b/cpp/src/arrow/io/hdfs_internal.cc @@ -41,11 +41,19 @@ #include #endif +#ifdef ARROW_WITH_BOOST_FILESYSTEM #include // NOLINT +#endif #include "arrow/status.h" #include "arrow/util/logging.h" +namespace arrow { +namespace io { +namespace internal { + +#ifdef ARROW_WITH_BOOST_FILESYSTEM + namespace fs = boost::filesystem; #ifndef _WIN32 @@ -282,10 +290,6 @@ static inline void* GetLibrarySymbol(void* handle, const char* symbol) { GetLibrarySymbol(SHIM->handle, "" #SYMBOL_NAME); \ } -namespace arrow { -namespace io { -namespace internal { - static LibHdfsShim libhdfs_shim; static LibHdfsShim libhdfs3_shim; @@ -570,6 +574,18 @@ Status ConnectLibHdfs3(LibHdfsShim** driver) { return shim->GetRequiredSymbols(); } +#else // ARROW_WITH_BOOST_FILESYSTEM + +Status ConnectLibHdfs(LibHdfsShim** 
driver) { + return Status::NotImplemented("ConnectLibHdfs not available in this Arrow build"); +} + +Status ConnectLibHdfs3(LibHdfsShim** driver) { + return Status::NotImplemented("ConnectLibHdfs3 not available in this Arrow build"); +} + +#endif + } // namespace internal } // namespace io } // namespace arrow diff --git a/cpp/src/arrow/ipc/json_integration_test.cc b/cpp/src/arrow/ipc/json_integration_test.cc index 931ede33d8a9..559880508f02 100644 --- a/cpp/src/arrow/ipc/json_integration_test.cc +++ b/cpp/src/arrow/ipc/json_integration_test.cc @@ -21,14 +21,13 @@ #include // IWYU pragma: keep #include #include +#include #include #include #include #include -#include // NOLINT - #include "arrow/io/file.h" #include "arrow/ipc/json_integration.h" #include "arrow/ipc/reader.h" @@ -38,6 +37,7 @@ #include "arrow/status.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" +#include "arrow/util/io_util.h" DEFINE_string(arrow, "", "Arrow file name"); DEFINE_string(json, "", "JSON file name"); @@ -47,11 +47,10 @@ DEFINE_string( DEFINE_bool(integration, false, "Run in integration test mode"); DEFINE_bool(verbose, true, "Verbose output"); -namespace fs = boost::filesystem; - namespace arrow { class Buffer; +using internal::TemporaryDir; namespace ipc { @@ -228,18 +227,15 @@ Status RunCommand(const std::string& json_path, const std::string& arrow_path, } } -static std::string temp_path() { - return (fs::temp_directory_path() / fs::unique_path()).string(); -} - class TestJSONIntegration : public ::testing::Test { public: - void SetUp() {} + void SetUp() { ASSERT_OK(TemporaryDir::Make("json-integration-test-", &temp_dir_)); } std::string mkstemp() { - auto path = temp_path(); - tmp_paths_.push_back(path); - return path; + std::stringstream ss; + ss << temp_dir_->path().ToString(); + ss << "file" << ntemp_++; + return ss.str(); } Status WriteJson(const char* data, const std::string& path) { @@ -251,14 +247,11 @@ class TestJSONIntegration : public ::testing::Test { 
return Status::OK(); } - void TearDown() { - for (const std::string path : tmp_paths_) { - ARROW_UNUSED(std::remove(path.c_str())); - } - } + void TearDown() { temp_dir_.reset(); } protected: - std::vector tmp_paths_; + std::unique_ptr temp_dir_; + int ntemp_ = 1; }; static const char* JSON_EXAMPLE = R"example( diff --git a/cpp/src/arrow/util/io_util.cc b/cpp/src/arrow/util/io_util.cc index 45f2afde5f4d..a2c680e77927 100644 --- a/cpp/src/arrow/util/io_util.cc +++ b/cpp/src/arrow/util/io_util.cc @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -49,7 +50,9 @@ #define ARROW_WRITE_SHMODE S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH #endif +#ifdef ARROW_WITH_BOOST_FILESYSTEM #include +#endif // ---------------------------------------------------------------------- // file compatibility stuff @@ -178,10 +181,24 @@ Status StdinStream::Read(int64_t nbytes, std::shared_ptr* out) { namespace internal { +#ifdef ARROW_WITH_BOOST_FILESYSTEM namespace bfs = ::boost::filesystem; +#endif namespace { +template +std::basic_string ReplaceChars(std::basic_string s, CharT find, CharT rep) { + if (find != rep) { + for (size_t i = 0; i < s.length(); ++i) { + if (s[i] == find) { + s[i] = rep; + } + } + } + return s; +} + Status StringToNative(const std::string& s, NativePathString* out) { #if _WIN32 std::wstring ws; @@ -193,8 +210,73 @@ Status StringToNative(const std::string& s, NativePathString* out) { return Status::OK(); } +#if _WIN32 +Status NativeToString(const NativePathString& ws, std::string* out) { + std::string s; + RETURN_NOT_OK(::arrow::util::WideStringToUTF8(ws, &s)); + *out = std::move(s); + return Status::OK(); +} +#endif + +#if _WIN32 +const wchar_t kNativeSep = L'\\'; +const wchar_t kGenericSep = L'/'; +const wchar_t* kAllSeps = L"\\/"; +#else +const char kNativeSep = '/'; +const char kGenericSep = '/'; +const char* kAllSeps = "/"; +#endif + +NativePathString NativeSlashes(NativePathString s) { + return ReplaceChars(std::move(s), kGenericSep, 
kNativeSep); +} + +NativePathString GenericSlashes(NativePathString s) { + return ReplaceChars(std::move(s), kNativeSep, kGenericSep); +} + +NativePathString NativeParent(const NativePathString& s) { + auto last_sep = s.find_last_of(kAllSeps); + if (last_sep == s.length() - 1) { + // Last separator is a trailing separator, skip all trailing separators + // and try again + auto before_last_seps = s.find_last_not_of(kAllSeps); + if (before_last_seps == NativePathString::npos) { + // Only separators in path + return s; + } + last_sep = s.find_last_of(kAllSeps, before_last_seps); + } + if (last_sep == NativePathString::npos) { + // No (other) separator in path + return s; + } + // There may be multiple contiguous separators, skip all of them + auto before_last_seps = s.find_last_not_of(kAllSeps, last_sep); + if (before_last_seps == NativePathString::npos) { + // All separators are at start of string, keep them all + return s.substr(0, last_sep + 1); + } else { + return s.substr(0, before_last_seps + 1); + } +} + +Status ValidatePath(const std::string& s) { + if (s.find_first_of('\0') != std::string::npos) { + return Status::Invalid("Embedded NUL char in path: '", s, "'"); + } + return Status::OK(); +} + } // namespace +#ifdef ARROW_WITH_BOOST_FILESYSTEM + +// NOTE: catching filesystem_error gives more context than system::error_code +// (it includes the file path(s) in the error message) + #define BOOST_FILESYSTEM_TRY try { #define BOOST_FILESYSTEM_CATCH \ } \ @@ -202,25 +284,11 @@ Status StringToNative(const std::string& s, NativePathString* out) { return ToStatus(_err); \ } -// NOTE: catching filesystem_error gives more context than system::error_code -// (it includes the file path(s) in the error message) - static Status ToStatus(const bfs::filesystem_error& err) { return Status::IOError(err.what()); } -static std::string MakeRandomName(int num_chars) { - static const std::string chars = "0123456789abcdefghijklmnopqrstuvwxyz"; - std::random_device gen; - 
std::uniform_int_distribution dist(0, static_cast(chars.length() - 1)); - - std::string s; - s.reserve(num_chars); - for (int i = 0; i < num_chars; ++i) { - s += chars[dist(gen)]; - } - return s; -} +#endif // ARROW_WITH_BOOST_FILESYSTEM std::string ErrnoMessage(int errnum) { return std::strerror(errnum); } @@ -245,27 +313,32 @@ std::string WinErrorMessage(int errnum) { struct PlatformFilename::Impl { Impl() = default; - explicit Impl(bfs::path p) : path(p.make_preferred()) {} + explicit Impl(NativePathString p) : native_(NativeSlashes(std::move(p))) {} - bfs::path path; + NativePathString native_; + + // '/'-separated + NativePathString generic() const { return GenericSlashes(native_); } + +#ifdef ARROW_WITH_BOOST_FILESYSTEM + bfs::path boost_path() const { return bfs::path(native_); } +#endif }; PlatformFilename::PlatformFilename() : impl_(new Impl{}) {} PlatformFilename::~PlatformFilename() {} -PlatformFilename::PlatformFilename(const Impl& impl) : impl_(new Impl(impl)) {} - -PlatformFilename::PlatformFilename(Impl&& impl) : impl_(new Impl(std::move(impl))) {} +PlatformFilename::PlatformFilename(Impl impl) : impl_(new Impl(std::move(impl))) {} PlatformFilename::PlatformFilename(const PlatformFilename& other) - : PlatformFilename(Impl{other.impl_->path}) {} + : PlatformFilename(Impl{other.impl_->native_}) {} PlatformFilename::PlatformFilename(PlatformFilename&& other) : impl_(std::move(other.impl_)) {} PlatformFilename& PlatformFilename::operator=(const PlatformFilename& other) { - this->impl_.reset(new Impl{other.impl_->path}); + this->impl_.reset(new Impl{other.impl_->native_}); return *this; } @@ -277,15 +350,20 @@ PlatformFilename& PlatformFilename::operator=(PlatformFilename&& other) { PlatformFilename::PlatformFilename(const NativePathString& path) : PlatformFilename(Impl{path}) {} -const NativePathString& PlatformFilename::ToNative() const { - return impl_->path.native(); +bool PlatformFilename::operator==(const PlatformFilename& other) const { + return 
impl_->native_ == other.impl_->native_; +} + +bool PlatformFilename::operator!=(const PlatformFilename& other) const { + return impl_->native_ != other.impl_->native_; } +const NativePathString& PlatformFilename::ToNative() const { return impl_->native_; } + std::string PlatformFilename::ToString() const { #if _WIN32 - std::wstring ws = impl_->path.generic_wstring(); std::string s; - Status st = ::arrow::util::WideStringToUTF8(ws, &s); + Status st = NativeToString(impl_->generic(), &s); if (!st.ok()) { std::stringstream ss; ss << ""; @@ -293,14 +371,16 @@ std::string PlatformFilename::ToString() const { } return s; #else - return impl_->path.generic_string(); + return impl_->generic(); #endif } +PlatformFilename PlatformFilename::Parent() const { + return PlatformFilename(NativeParent(ToNative())); +} + Status PlatformFilename::FromString(const std::string& file_name, PlatformFilename* out) { - if (file_name.find_first_of('\0') != std::string::npos) { - return Status::Invalid("Embedded NUL char in file name: '", file_name, "'"); - } + RETURN_NOT_OK(ValidatePath(file_name)); NativePathString ns; RETURN_NOT_OK(StringToNative(file_name, &ns)); *out = PlatformFilename(std::move(ns)); @@ -309,38 +389,93 @@ Status PlatformFilename::FromString(const std::string& file_name, PlatformFilena Status PlatformFilename::Join(const std::string& child_name, PlatformFilename* out) const { - NativePathString ns; - RETURN_NOT_OK(StringToNative(child_name, &ns)); - auto path = impl_->path / ns; - *out = PlatformFilename(Impl{std::move(path)}); - return Status::OK(); + PlatformFilename child; + RETURN_NOT_OK(PlatformFilename::FromString(child_name, &child)); + if (impl_->native_.empty() || impl_->native_.back() == kNativeSep) { + *out = PlatformFilename(Impl{impl_->native_ + child.impl_->native_}); + return Status::OK(); + } else { + *out = PlatformFilename(Impl{impl_->native_ + kNativeSep + child.impl_->native_}); + return Status::OK(); + } +} + +Status FileNameFromString(const 
std::string& file_name, PlatformFilename* out) { + return PlatformFilename::FromString(file_name, out); +} + +// +// Filesystem access routines +// + +namespace { + +Status DoCreateDir(const PlatformFilename& dir_path, bool create_parents, bool* created) { +#ifdef _WIN32 + if (CreateDirectoryW(dir_path.ToNative().c_str(), nullptr)) { + *created = true; + return Status::OK(); + } + int errnum = GetLastError(); + if (errnum == ERROR_ALREADY_EXISTS) { + *created = false; + return Status::OK(); + } + if (create_parents && errnum == ERROR_PATH_NOT_FOUND) { + auto parent_path = dir_path.Parent(); + if (parent_path != dir_path) { + RETURN_NOT_OK(DoCreateDir(parent_path, create_parents, created)); + return DoCreateDir(dir_path, false, created); // Retry + } + } + return Status::IOError("Cannot create directory '", dir_path.ToString(), + "': ", WinErrorMessage(errnum)); +#else + if (mkdir(dir_path.ToNative().c_str(), S_IRWXU | S_IRWXG | S_IRWXO) == 0) { + *created = true; + return Status::OK(); + } + if (errno == EEXIST) { + *created = false; + return Status::OK(); + } + if (create_parents && errno == ENOENT) { + auto parent_path = dir_path.Parent(); + if (parent_path != dir_path) { + RETURN_NOT_OK(DoCreateDir(parent_path, create_parents, created)); + return DoCreateDir(dir_path, false, created); // Retry + } + } + return Status::IOError("Cannot create directory '", dir_path.ToString(), + "': ", ErrnoMessage(errno)); +#endif } +} // namespace + Status CreateDir(const PlatformFilename& dir_path, bool* created) { - bool res; - BOOST_FILESYSTEM_TRY - res = bfs::create_directory(dir_path.impl_->path); - BOOST_FILESYSTEM_CATCH + bool did_create = false; + RETURN_NOT_OK(DoCreateDir(dir_path, false, &did_create)); if (created) { - *created = res; + *created = did_create; } return Status::OK(); } Status CreateDirTree(const PlatformFilename& dir_path, bool* created) { - bool res; - BOOST_FILESYSTEM_TRY - res = bfs::create_directories(dir_path.impl_->path); - BOOST_FILESYSTEM_CATCH + 
bool did_create = false; + RETURN_NOT_OK(DoCreateDir(dir_path, true, &did_create)); if (created) { - *created = res; + *created = did_create; } return Status::OK(); } +#ifdef ARROW_WITH_BOOST_FILESYSTEM + Status DeleteDirTree(const PlatformFilename& dir_path, bool* deleted) { BOOST_FILESYSTEM_TRY - const auto& path = dir_path.impl_->path; + const auto& path = dir_path.impl()->boost_path(); // XXX There is a race here. auto st = bfs::symlink_status(path); if (st.type() != bfs::file_not_found && st.type() != bfs::directory_file) { @@ -356,7 +491,7 @@ Status DeleteDirTree(const PlatformFilename& dir_path, bool* deleted) { Status DeleteDirContents(const PlatformFilename& dir_path, bool* deleted) { BOOST_FILESYSTEM_TRY - const auto& path = dir_path.impl_->path; + const auto& path = dir_path.impl()->boost_path(); // XXX There is a race here. auto st = bfs::symlink_status(path); if (st.type() == bfs::file_not_found) { @@ -380,40 +515,73 @@ Status DeleteDirContents(const PlatformFilename& dir_path, bool* deleted) { return Status::OK(); } +#else // ARROW_WITH_BOOST_FILESYSTEM + +Status DeleteDirTree(const PlatformFilename& dir_path, bool* deleted) { + return Status::NotImplemented("DeleteDirTree not available in this Arrow build"); +} + +Status DeleteDirContents(const PlatformFilename& dir_path, bool* deleted) { + return Status::NotImplemented("DeleteDirContents not available in this Arrow build"); +} + +#endif + Status DeleteFile(const PlatformFilename& file_path, bool* deleted) { - BOOST_FILESYSTEM_TRY - bool res = false; - const auto& path = file_path.impl_->path; - // XXX There is a race here, and boost::filesystem doesn't allow deleting - // only files and not empty directories. 
- auto st = bfs::symlink_status(path); - if (!bfs::is_directory(st)) { - res = bfs::remove(path); + bool did_delete = false; +#ifdef _WIN32 + if (DeleteFileW(file_path.ToNative().c_str())) { + did_delete = true; + } else { + int errnum = GetLastError(); + if (errnum != ERROR_FILE_NOT_FOUND) { + return Status::IOError("Cannot delete file '", file_path.ToString(), + "': ", WinErrorMessage(errnum)); + } + } +#else + if (unlink(file_path.ToNative().c_str()) == 0) { + did_delete = true; } else { - return Status::IOError("Cannot delete directory '", path.string(), "'"); + if (errno != ENOENT) { + return Status::IOError("Cannot delete file '", file_path.ToString(), + "': ", ErrnoMessage(errno)); + } } +#endif if (deleted) { - *deleted = res; + *deleted = did_delete; } - BOOST_FILESYSTEM_CATCH return Status::OK(); } Status FileExists(const PlatformFilename& path, bool* out) { - BOOST_FILESYSTEM_TRY - *out = bfs::exists(path.impl_->path); - BOOST_FILESYSTEM_CATCH +#ifdef _WIN32 + if (GetFileAttributesW(path.ToNative().c_str()) != INVALID_FILE_ATTRIBUTES) { + *out = true; + } else { + int errnum = GetLastError(); + if (errnum != ERROR_PATH_NOT_FOUND && errnum != ERROR_FILE_NOT_FOUND) { + return Status::IOError("Failed getting information for path '", path.ToString(), + "': ", WinErrorMessage(errnum)); + } + *out = false; + } +#else + struct stat st; + if (stat(path.ToNative().c_str(), &st) == 0) { + *out = true; + } else { + if (errno != ENOENT && errno != ENOTDIR) { + return Status::IOError("Failed getting information for path '", path.ToString(), + "': ", ErrnoMessage(errno)); + } + *out = false; + } +#endif return Status::OK(); } -// -// File name handling -// - -Status FileNameFromString(const std::string& file_name, PlatformFilename* out) { - return PlatformFilename::FromString(file_name, out); -} - // // Functions for creating file descriptors // @@ -862,6 +1030,28 @@ Status GetEnvVar(const std::string& name, std::string* out) { return GetEnvVar(name.c_str(), out); } 
+#ifdef _WIN32 +Status GetEnvVar(const std::string& name, NativePathString* out) { + NativePathString w_name; + constexpr int32_t bufsize = 2000; + wchar_t w_str[bufsize]; + + RETURN_NOT_OK(StringToNative(name, &w_name)); + auto res = GetEnvironmentVariableW(w_name.c_str(), w_str, bufsize); + if (res >= bufsize) { + return Status::CapacityError("environment variable value too long"); + } else if (res == 0) { + return Status::KeyError("environment variable undefined"); + } + *out = NativePathString(w_str); + return Status::OK(); +} + +Status GetEnvVar(const char* name, NativePathString* out) { + return GetEnvVar(std::string(name), out); +} +#endif + Status SetEnvVar(const char* name, const char* value) { #ifdef _WIN32 if (SetEnvironmentVariableA(name, value)) { @@ -900,33 +1090,134 @@ Status DelEnvVar(const char* name) { Status DelEnvVar(const std::string& name) { return DelEnvVar(name.c_str()); } -TemporaryDir::TemporaryDir(PlatformFilename&& path) : path_(std::move(path)) {} +// +// Temporary directories +// -TemporaryDir::~TemporaryDir() { - Status st = DeleteDirTree(path_); - if (!st.ok()) { - ARROW_LOG(WARNING) << "When trying to delete temporary directory: " << st; +#ifdef ARROW_WITH_BOOST_FILESYSTEM + +namespace { + +#if _WIN32 +NativePathString GetWindowsDirectoryPath() { + auto size = GetWindowsDirectoryW(nullptr, 0); + ARROW_CHECK_GT(size, 0) << "GetWindowsDirectoryW failed"; + std::vector w_str(size); + size = GetWindowsDirectoryW(w_str.data(), size); + ARROW_CHECK_GT(size, 0) << "GetWindowsDirectoryW failed"; + return {w_str.data(), size}; +} +#endif + +// Return a list of preferred locations for temporary files +std::vector GetPlatformTemporaryDirs() { + struct TempDirSelector { + std::string env_var; + NativePathString path_append; + }; + + std::vector selectors; + NativePathString fallback_tmp; + +#if _WIN32 + selectors = { + {"TMP", L""}, {"TEMP", L""}, {"LOCALAPPDATA", L"Temp"}, {"USERPROFILE", L"Temp"}}; + fallback_tmp = GetWindowsDirectoryPath(); 
+ +#else + selectors = {{"TMPDIR", ""}, {"TMP", ""}, {"TEMP", ""}, {"TEMPDIR", ""}}; +#ifdef __ANDROID__ + fallback_tmp = "/data/local/tmp"; +#else + fallback_tmp = "/tmp"; +#endif +#endif + + std::vector temp_dirs; + for (const auto& sel : selectors) { + NativePathString p; + Status st = GetEnvVar(sel.env_var, &p); + if (st.IsKeyError()) { + // Environment variable absent, skip + continue; + } + if (!st.ok()) { + ARROW_LOG(WARNING) << "Failed getting env var '" << sel.env_var + << "': " << st.ToString(); + continue; + } + if (p.empty()) { + // Environment variable set to empty string, skip + continue; + } + if (sel.path_append.empty()) { + temp_dirs.push_back(p); + } else { + temp_dirs.push_back(p + kNativeSep + sel.path_append); + } + } + temp_dirs.push_back(fallback_tmp); + return temp_dirs; +} + +std::string MakeRandomName(int num_chars) { + static const std::string chars = "0123456789abcdefghijklmnopqrstuvwxyz"; + std::random_device gen; + std::uniform_int_distribution dist(0, static_cast(chars.length() - 1)); + + std::string s; + s.reserve(num_chars); + for (int i = 0; i < num_chars; ++i) { + s += chars[dist(gen)]; } + return s; } +} // namespace Status TemporaryDir::Make(const std::string& prefix, std::unique_ptr* out) { - bfs::path path; std::string suffix = MakeRandomName(8); + NativePathString base_name; + RETURN_NOT_OK(StringToNative(prefix + suffix, &base_name)); + + auto base_dirs = GetPlatformTemporaryDirs(); + DCHECK_NE(base_dirs.size(), 0); + + auto st = Status::OK(); + for (const auto& p : base_dirs) { + PlatformFilename fn(p + kNativeSep + base_name + kNativeSep); + bool created = false; + st = CreateDir(fn, &created); + if (!st.ok()) { + continue; + } + if (!created) { + // XXX Should we retry with another random name? 
+ return Status::IOError("Path already exists: '", fn.ToString(), "'"); + } else { + out->reset(new TemporaryDir(std::move(fn))); + return Status::OK(); + } + } - BOOST_FILESYSTEM_TRY - path = bfs::temp_directory_path() / (prefix + suffix); - path += "/"; - BOOST_FILESYSTEM_CATCH + DCHECK(!st.ok()); + return st; +} - PlatformFilename fn(path.native()); - bool created = false; - RETURN_NOT_OK(CreateDir(fn, &created)); - if (!created) { - // XXX Should we retry? - return Status::IOError("Path already exists: '", fn.ToString(), "'"); +#else // ARROW_WITH_BOOST_FILESYSTEM + +Status TemporaryDir::Make(const std::string& prefix, std::unique_ptr* out) { + return Status::NotImplemented("TemporaryDir not available in this Arrow build"); +} + +#endif + +TemporaryDir::TemporaryDir(PlatformFilename&& path) : path_(std::move(path)) {} + +TemporaryDir::~TemporaryDir() { + Status st = DeleteDirTree(path_); + if (!st.ok()) { + ARROW_LOG(WARNING) << "When trying to delete temporary directory: " << st; } - out->reset(new TemporaryDir(std::move(fn))); - return Status::OK(); } SignalHandler::SignalHandler() : SignalHandler(static_cast(nullptr)) {} diff --git a/cpp/src/arrow/util/io_util.h b/cpp/src/arrow/util/io_util.h index 1464aaf42e43..b0fd11cfc94e 100644 --- a/cpp/src/arrow/util/io_util.h +++ b/cpp/src/arrow/util/io_util.h @@ -113,6 +113,8 @@ using NativePathString = std::string; class ARROW_EXPORT PlatformFilename { public: + struct Impl; + ~PlatformFilename(); PlatformFilename(); PlatformFilename(const PlatformFilename&); @@ -124,24 +126,22 @@ class ARROW_EXPORT PlatformFilename { const NativePathString& ToNative() const; std::string ToString() const; + PlatformFilename Parent() const; + // These functions can fail for character encoding reasons. 
static Status FromString(const std::string& file_name, PlatformFilename* out); Status Join(const std::string& child_name, PlatformFilename* out) const; + bool operator==(const PlatformFilename& other) const; + bool operator!=(const PlatformFilename& other) const; + + // Made public to avoid the proliferation of friend declarations. + const Impl* impl() const { return impl_.get(); } + private: - struct Impl; std::unique_ptr impl_; - explicit PlatformFilename(const Impl& impl); - explicit PlatformFilename(Impl&& impl); - - // Those functions need access to the embedded path object - friend ARROW_EXPORT Status CreateDir(const PlatformFilename&, bool*); - friend ARROW_EXPORT Status CreateDirTree(const PlatformFilename&, bool*); - friend ARROW_EXPORT Status DeleteDirContents(const PlatformFilename&, bool*); - friend ARROW_EXPORT Status DeleteDirTree(const PlatformFilename&, bool*); - friend ARROW_EXPORT Status DeleteFile(const PlatformFilename&, bool*); - friend ARROW_EXPORT Status FileExists(const PlatformFilename&, bool*); + explicit PlatformFilename(Impl impl); }; ARROW_EXPORT @@ -199,6 +199,12 @@ ARROW_EXPORT Status GetEnvVar(const char* name, std::string* out); ARROW_EXPORT Status GetEnvVar(const std::string& name, std::string* out); +#ifdef _WIN32 +ARROW_EXPORT +Status GetEnvVar(const char* name, NativePathString* out); +ARROW_EXPORT +Status GetEnvVar(const std::string& name, NativePathString* out); +#endif ARROW_EXPORT Status SetEnvVar(const char* name, const char* value); ARROW_EXPORT @@ -219,8 +225,11 @@ class ARROW_EXPORT TemporaryDir { public: ~TemporaryDir(); + /// '/'-terminated path to the temporary dir const PlatformFilename& path() { return path_; } + /// Create a temporary subdirectory in the system temporary dir, + /// named starting with `prefix`. 
static Status Make(const std::string& prefix, std::unique_ptr* out); private: diff --git a/cpp/src/arrow/util/io_util_test.cc b/cpp/src/arrow/util/io_util_test.cc index 23001f02ffb3..016f715792c4 100644 --- a/cpp/src/arrow/util/io_util_test.cc +++ b/cpp/src/arrow/util/io_util_test.cc @@ -77,6 +77,188 @@ TEST(PlatformFilename, Invalid) { ASSERT_RAISES(Invalid, PlatformFilename::FromString(s, &fn)); } +TEST(PlatformFilename, Join) { + PlatformFilename fn, joined; + ASSERT_OK(PlatformFilename::FromString("a/b", &fn)); + ASSERT_OK(fn.Join("c/d", &joined)); + ASSERT_EQ(joined.ToString(), "a/b/c/d"); +#if _WIN32 + ASSERT_EQ(joined.ToNative(), L"a\\b\\c\\d"); +#else + ASSERT_EQ(joined.ToNative(), "a/b/c/d"); +#endif + + ASSERT_OK(PlatformFilename::FromString("a/b/", &fn)); + ASSERT_OK(fn.Join("c/d", &joined)); + ASSERT_EQ(joined.ToString(), "a/b/c/d"); +#if _WIN32 + ASSERT_EQ(joined.ToNative(), L"a\\b\\c\\d"); +#else + ASSERT_EQ(joined.ToNative(), "a/b/c/d"); +#endif + + ASSERT_OK(PlatformFilename::FromString("", &fn)); + ASSERT_OK(fn.Join("c/d", &joined)); + ASSERT_EQ(joined.ToString(), "c/d"); +#if _WIN32 + ASSERT_EQ(joined.ToNative(), L"c\\d"); +#else + ASSERT_EQ(joined.ToNative(), "c/d"); +#endif + +#if _WIN32 + ASSERT_OK(PlatformFilename::FromString("a\\b", &fn)); + ASSERT_OK(fn.Join("c\\d", &joined)); + ASSERT_EQ(joined.ToString(), "a/b/c/d"); + ASSERT_EQ(joined.ToNative(), L"a\\b\\c\\d"); + + ASSERT_OK(PlatformFilename::FromString("a\\b\\", &fn)); + ASSERT_OK(fn.Join("c\\d", &joined)); + ASSERT_EQ(joined.ToString(), "a/b/c/d"); + ASSERT_EQ(joined.ToNative(), L"a\\b\\c\\d"); +#endif +} + +TEST(PlatformFilename, JoinInvalid) { + PlatformFilename fn, joined; + ASSERT_OK(PlatformFilename::FromString("a/b", &fn)); + std::string s = "foo"; + s += '\x00'; + ASSERT_RAISES(Invalid, fn.Join(s, &joined)); +} + +TEST(PlatformFilename, Parent) { + PlatformFilename fn; + + // Relative + ASSERT_OK(PlatformFilename::FromString("ab/cd", &fn)); + ASSERT_EQ(fn.ToString(), "ab/cd"); + 
fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), "ab"); + fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), "ab"); +#if _WIN32 + ASSERT_OK(PlatformFilename::FromString("ab/cd\\ef", &fn)); + ASSERT_EQ(fn.ToString(), "ab/cd/ef"); + fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), "ab/cd"); + fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), "ab"); + fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), "ab"); +#endif + + // Absolute + ASSERT_OK(PlatformFilename::FromString("/ab/cd/ef", &fn)); + ASSERT_EQ(fn.ToString(), "/ab/cd/ef"); + fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), "/ab/cd"); + fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), "/ab"); + fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), "/"); + fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), "/"); +#if _WIN32 + ASSERT_OK(PlatformFilename::FromString("\\ab\\cd/ef", &fn)); + ASSERT_EQ(fn.ToString(), "/ab/cd/ef"); + fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), "/ab/cd"); + fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), "/ab"); + fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), "/"); + fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), "/"); +#endif + + // Empty + ASSERT_OK(PlatformFilename::FromString("", &fn)); + ASSERT_EQ(fn.ToString(), ""); + fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), ""); + + // Multiple separators, relative + ASSERT_OK(PlatformFilename::FromString("ab//cd///ef", &fn)); + ASSERT_EQ(fn.ToString(), "ab//cd///ef"); + fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), "ab//cd"); + fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), "ab"); + fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), "ab"); +#if _WIN32 + ASSERT_OK(PlatformFilename::FromString("ab\\\\cd\\\\\\ef", &fn)); + ASSERT_EQ(fn.ToString(), "ab//cd///ef"); + fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), "ab//cd"); + fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), "ab"); + fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), "ab"); +#endif + + // Multiple separators, absolute + ASSERT_OK(PlatformFilename::FromString("//ab//cd///ef", &fn)); + ASSERT_EQ(fn.ToString(), "//ab//cd///ef"); + fn = fn.Parent(); + 
ASSERT_EQ(fn.ToString(), "//ab//cd"); + fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), "//ab"); + fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), "//"); + fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), "//"); +#if _WIN32 + ASSERT_OK(PlatformFilename::FromString("\\\\ab\\cd\\ef", &fn)); + ASSERT_EQ(fn.ToString(), "//ab/cd/ef"); + fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), "//ab/cd"); + fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), "//ab"); + fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), "//"); + fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), "//"); +#endif + + // Trailing slashes + ASSERT_OK(PlatformFilename::FromString("/ab/cd/ef/", &fn)); + ASSERT_EQ(fn.ToString(), "/ab/cd/ef/"); + fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), "/ab/cd"); + ASSERT_OK(PlatformFilename::FromString("/ab/cd/ef//", &fn)); + ASSERT_EQ(fn.ToString(), "/ab/cd/ef//"); + fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), "/ab/cd"); + ASSERT_OK(PlatformFilename::FromString("ab/", &fn)); + ASSERT_EQ(fn.ToString(), "ab/"); + fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), "ab/"); + ASSERT_OK(PlatformFilename::FromString("ab//", &fn)); + ASSERT_EQ(fn.ToString(), "ab//"); + fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), "ab//"); +#if _WIN32 + ASSERT_OK(PlatformFilename::FromString("\\ab\\cd\\ef\\", &fn)); + ASSERT_EQ(fn.ToString(), "/ab/cd/ef/"); + fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), "/ab/cd"); + ASSERT_OK(PlatformFilename::FromString("\\ab\\cd\\ef\\\\", &fn)); + ASSERT_EQ(fn.ToString(), "/ab/cd/ef//"); + fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), "/ab/cd"); + ASSERT_OK(PlatformFilename::FromString("ab\\", &fn)); + ASSERT_EQ(fn.ToString(), "ab/"); + fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), "ab/"); + ASSERT_OK(PlatformFilename::FromString("ab\\\\", &fn)); + ASSERT_EQ(fn.ToString(), "ab//"); + fn = fn.Parent(); + ASSERT_EQ(fn.ToString(), "ab//"); +#endif +} + TEST(CreateDirDeleteDir, Basics) { const std::string BASE = "xxx-io-util-test-dir"; bool created, deleted; From 
227a33f3af7e2921f1837d03643cbecd5a5096ad Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Thu, 3 Oct 2019 13:42:54 -0400 Subject: [PATCH 180/201] ARROW-6494: [C++][Dataset] Implement PartitionSchemes Implements SimplePartitonScheme (ignores paths in favor of a preconfigured partition expression) and HivePartitionScheme (which parses paths containing embedded equality expressions) Closes #5443 from bkietz/6494-Dataset-Implement-basic-P and squashes the following commits: e385fdcce fix test class; should probably just use unique_ptr f3acb2128 add Status return + out arg overload of Parse 6d7bf89d2 address review comments dd9c2ee10 add cpp/.clangd to gitignore ca1e50647 restrict MakeScalar to ScalarType::ValueType a7a2e7d73 even more explicit StringScalar constructor for msvc 9e57edf0c amend cpplint to ignore alt_tokens preceding parens f85f36823 make StringScalar constructor more explicit for msvc a0ea783a0 revert composability changes af984240f rewrite scalar factories to avoid implicit conversion a499cf8c2 refactor vector and_, or_ to avoid recursion 1ed360818 rename to ConstantPartitionScheme f2b7c9f23 add non equality conjunction partition examples 86485225d add convenience Parse() overload, more Hive tests 52582ef6b lint fixes 57106721f refactor PartitionScheme to allow chaining cf1c1b58b extract scalar factories from ScalarExpression a1a12b617 move optional detection/handling to stl.h ec65f665c refactor partition tests to share more code 01be61c3e first pass at HivePartitionScheme 570fa2246 ARROW-6494: Implement PartitionSchemes Authored-by: Benjamin Kietzman Signed-off-by: Benjamin Kietzman --- .gitignore | 1 + cpp/build-support/cpplint.py | 2 + cpp/src/arrow/compare.cc | 3 +- cpp/src/arrow/dataset/CMakeLists.txt | 3 + cpp/src/arrow/dataset/file_test.cc | 1 + cpp/src/arrow/dataset/filter.cc | 131 ++++++++------ cpp/src/arrow/dataset/filter.h | 45 ++--- cpp/src/arrow/dataset/filter_test.cc | 10 +- cpp/src/arrow/dataset/partition.cc | 94 ++++++++++ 
cpp/src/arrow/dataset/partition.h | 226 ++++++++++-------------- cpp/src/arrow/dataset/partition_test.cc | 226 ++++++++++++++++++++++++ cpp/src/arrow/result.h | 19 +- cpp/src/arrow/result_test.cc | 80 +++++++-- cpp/src/arrow/scalar.cc | 98 +++++++--- cpp/src/arrow/scalar.h | 203 ++++++++++++++++----- cpp/src/arrow/scalar_test.cc | 137 +++++++++++--- cpp/src/arrow/stl.h | 48 +++-- cpp/src/arrow/stl_test.cc | 6 +- cpp/src/arrow/type_traits.h | 42 +---- 19 files changed, 987 insertions(+), 388 deletions(-) create mode 100644 cpp/src/arrow/dataset/partition.cc create mode 100644 cpp/src/arrow/dataset/partition_test.cc diff --git a/.gitignore b/.gitignore index 8f1de319b540..9df5101995f2 100644 --- a/.gitignore +++ b/.gitignore @@ -45,6 +45,7 @@ perf.data perf.data.old cpp/.idea/ +cpp/.clangd/ cpp/apidoc/xml/ docs/example.gz docs/example1.dat diff --git a/cpp/build-support/cpplint.py b/cpp/build-support/cpplint.py index 62a22d603be5..81a3b2aed8c9 100755 --- a/cpp/build-support/cpplint.py +++ b/cpp/build-support/cpplint.py @@ -3103,6 +3103,8 @@ def CheckSpacingForFunctionCall(filename, clean_lines, linenum, error): not Search(r'_{0,2}asm_{0,2}\s+_{0,2}volatile_{0,2}\s+\(', fncall) and not Search(r'#\s*define|typedef|using\s+\w+\s*=', fncall) and not Search(r'\w\s+\((\w+::)*\*\w+\)\(', fncall) and + not Search(r'\b(' + '|'.join(_ALT_TOKEN_REPLACEMENT.keys()) + r')\b\s+\(', + fncall) and not Search(r'\bcase\s+\(', fncall)): # TODO(unknown): Space after an operator function seem to be a common # error, silence those for now by restricting them to highest verbosity. 
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index b751c4ee10aa..eae54e7092f4 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -886,8 +886,7 @@ class ScalarEqualsVisitor { Status Visit(const MapScalar& left) { const auto& right = checked_cast(right_); - result_ = internal::SharedPtrEquals(left.keys, right.keys) && - internal::SharedPtrEquals(left.items, right.items); + result_ = internal::SharedPtrEquals(left.value, right.value); return Status::OK(); } diff --git a/cpp/src/arrow/dataset/CMakeLists.txt b/cpp/src/arrow/dataset/CMakeLists.txt index 442e9e6587bc..0a99846f3a29 100644 --- a/cpp/src/arrow/dataset/CMakeLists.txt +++ b/cpp/src/arrow/dataset/CMakeLists.txt @@ -28,7 +28,9 @@ set(ARROW_DATASET_SRCS discovery.cc file_base.cc filter.cc + partition.cc scanner.cc) + set(ARROW_DATASET_LINK_STATIC arrow_static) set(ARROW_DATASET_LINK_SHARED arrow_shared) @@ -99,6 +101,7 @@ if(NOT WIN32) add_arrow_dataset_test(discovery_test) add_arrow_dataset_test(file_test) add_arrow_dataset_test(filter_test) + add_arrow_dataset_test(partition_test) add_arrow_dataset_test(scanner_test) if(ARROW_PARQUET) diff --git a/cpp/src/arrow/dataset/file_test.cc b/cpp/src/arrow/dataset/file_test.cc index be6af4cbbfc0..806dcca89424 100644 --- a/cpp/src/arrow/dataset/file_test.cc +++ b/cpp/src/arrow/dataset/file_test.cc @@ -24,6 +24,7 @@ #include #include "arrow/dataset/api.h" +#include "arrow/dataset/partition.h" #include "arrow/dataset/test_util.h" #include "arrow/filesystem/path_util.h" #include "arrow/filesystem/test_util.h" diff --git a/cpp/src/arrow/dataset/filter.cc b/cpp/src/arrow/dataset/filter.cc index bcdee2897f1f..9c059d1c67ee 100644 --- a/cpp/src/arrow/dataset/filter.cc +++ b/cpp/src/arrow/dataset/filter.cc @@ -19,9 +19,11 @@ #include #include +#include #include #include #include +#include #include "arrow/buffer.h" #include "arrow/buffer_builder.h" @@ -45,7 +47,11 @@ Result ScalarExpression::Evaluate(compute::FunctionContext* ctx, return 
value_; } -Datum NullDatum() { return Datum(std::make_shared()); } +inline std::shared_ptr NullExpression() { + return std::make_shared(std::make_shared()); +} + +inline Datum NullDatum() { return Datum(std::make_shared()); } Result FieldExpression::Evaluate(compute::FunctionContext* ctx, const RecordBatch& batch) const { @@ -187,23 +193,6 @@ Result ComparisonExpression::Evaluate(compute::FunctionContext* ctx, return std::move(out); } -std::shared_ptr ScalarExpression::Make(std::string value) { - return std::make_shared( - std::make_shared(Buffer::FromString(std::move(value)))); -} - -std::shared_ptr ScalarExpression::Make(const char* value) { - return std::make_shared( - std::make_shared(Buffer::Wrap(value, std::strlen(value)))); -} - -std::shared_ptr ScalarExpression::MakeNull( - const std::shared_ptr& type) { - std::shared_ptr null; - DCHECK_OK(arrow::MakeNullScalar(type, &null)); - return Make(std::move(null)); -} - struct Comparison { enum type { LESS, @@ -384,7 +373,7 @@ Result> ComparisonExpression::Assume( if (simplified->IsNull()) { // some subexpression of given is always null, return null - return ScalarExpression::MakeNull(boolean()); + return NullExpression(); } bool trivial; @@ -404,11 +393,11 @@ Result> ComparisonExpression::Assume( } if (simplify_to_always) { - return ScalarExpression::Make(true); + return scalar(true); } if (simplify_to_never) { - return ScalarExpression::Make(false); + return scalar(false); } return Copy(); @@ -420,7 +409,7 @@ Result> ComparisonExpression::Assume( auto simplified = Copy(); for (const auto& operand : {given_and.left_operand(), given_and.right_operand()}) { if (simplified->IsNull()) { - return ScalarExpression::MakeNull(boolean()); + return NullExpression(); } if (simplified->IsTrivialCondition()) { @@ -469,11 +458,11 @@ Result> ComparisonExpression::AssumeGivenComparison( if (cmp == Comparison::NULL_) { // the RHS of e or given was null - return ScalarExpression::MakeNull(boolean()); + return NullExpression(); } - 
static auto always = ScalarExpression::Make(true); - static auto never = ScalarExpression::Make(false); + static auto always = scalar(true); + static auto never = scalar(false); using compute::CompareOperator; @@ -619,7 +608,7 @@ Result> AndExpression::Assume(const Expression& give // if either operand is trivially null then so is this AND if (left_operand->IsNull() || right_operand->IsNull()) { - return ScalarExpression::MakeNull(boolean()); + return NullExpression(); } bool left_trivial, right_trivial; @@ -637,7 +626,7 @@ Result> AndExpression::Assume(const Expression& give (right_is_trivial && right_trivial == false)) { // FIXME(bkietz) if left is false and right is a column conaining nulls, this is an // error because we should be yielding null there rather than false - return ScalarExpression::Make(false); + return scalar(false); } // at least one of the operands is trivially true; return the other operand @@ -650,7 +639,7 @@ Result> OrExpression::Assume(const Expression& given // if either operand is trivially null then so is this OR if (left_operand->IsNull() || right_operand->IsNull()) { - return ScalarExpression::MakeNull(boolean()); + return NullExpression(); } bool left_trivial, right_trivial; @@ -668,7 +657,7 @@ Result> OrExpression::Assume(const Expression& given (right_is_trivial && right_trivial == true)) { // FIXME(bkietz) if left is true but right is a column conaining nulls, this is an // error because we should be yielding null there rather than true - return ScalarExpression::Make(true); + return scalar(true); } // at least one of the operands is trivially false; return the other operand @@ -679,12 +668,12 @@ Result> NotExpression::Assume(const Expression& give ARROW_ASSIGN_OR_RAISE(auto operand, operand_->Assume(given)); if (operand->IsNull()) { - return ScalarExpression::MakeNull(boolean()); + return NullExpression(); } bool trivial; if (operand->IsTrivialCondition(&trivial)) { - return ScalarExpression::Make(!trivial); + return 
scalar(!trivial); } return Copy(); @@ -715,34 +704,46 @@ std::string OperatorName(compute::CompareOperator op) { return ""; } +// TODO(bkietz) extract this to Scalar::ToString() +struct ScalarExpressionToString { + Status Visit(const BooleanType&) { + return Finish(CastValue().value ? "true" : "false"); + } + + template + enable_if_number Visit(const T&) { + return Finish(std::to_string(CastValue().value)); + } + + Status Visit(const StringType&) { + return Finish(CastValue().value->ToString()); + } + + Status Visit(const DataType&) { return Finish("TODO(bkietz)"); } + + Status Finish(std::string repr) { + *repr_ = std::move(repr); + return Status::OK(); + } + + template + const typename TypeTraits::ScalarType& CastValue() { + return checked_cast::ScalarType&>(value_); + } + + const Scalar& value_; + std::string* repr_; +}; + std::string ScalarExpression::ToString() const { if (!value_->is_valid) { return "scalar<" + value_->type->ToString() + ", null>()"; } - std::string value; - switch (value_->type->id()) { - case Type::BOOL: - value = checked_cast(*value_).value ? 
"true" : "false"; - break; - case Type::INT32: - value = std::to_string(checked_cast(*value_).value); - break; - case Type::INT64: - value = std::to_string(checked_cast(*value_).value); - break; - case Type::DOUBLE: - value = std::to_string(checked_cast(*value_).value); - break; - case Type::STRING: - value = checked_cast(*value_).value->ToString(); - break; - default: - value = "TODO(bkietz)"; - break; - } - - return "scalar<" + value_->type->ToString() + ">(" + value + ")"; + std::string repr; + ScalarExpressionToString impl{*value_, &repr}; + DCHECK_OK(VisitTypeInline(*value_->type, &impl)); + return "scalar<" + value_->type->ToString() + ">(" + repr + ")"; } static std::string EulerNotation(std::string fn, const ExpressionVector& operands) { @@ -865,11 +866,33 @@ std::shared_ptr and_(std::shared_ptr lhs, return std::make_shared(std::move(lhs), std::move(rhs)); } +std::shared_ptr and_(const ExpressionVector& subexpressions) { + if (subexpressions.size() == 0) { + return scalar(true); + } + return std::accumulate( + subexpressions.begin(), subexpressions.end(), std::shared_ptr(), + [](std::shared_ptr acc, const std::shared_ptr& next) { + return acc == nullptr ? next : and_(std::move(acc), next); + }); +} + std::shared_ptr or_(std::shared_ptr lhs, std::shared_ptr rhs) { return std::make_shared(std::move(lhs), std::move(rhs)); } +std::shared_ptr or_(const ExpressionVector& subexpressions) { + if (subexpressions.size() == 0) { + return scalar(false); + } + return std::accumulate( + subexpressions.begin(), subexpressions.end(), std::shared_ptr(), + [](std::shared_ptr acc, const std::shared_ptr& next) { + return acc == nullptr ? 
next : or_(std::move(acc), next); + }); +} + std::shared_ptr not_(std::shared_ptr operand) { return std::make_shared(std::move(operand)); } diff --git a/cpp/src/arrow/dataset/filter.h b/cpp/src/arrow/dataset/filter.h index 3f122c18ea4c..a06b93687e5f 100644 --- a/cpp/src/arrow/dataset/filter.h +++ b/cpp/src/arrow/dataset/filter.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "arrow/compute/kernel.h" #include "arrow/compute/kernels/compare.h" @@ -295,30 +296,6 @@ class ARROW_DS_EXPORT ScalarExpression final : public Expression { bool Equals(const Expression& other) const override; - static std::shared_ptr Make(bool value) { - return std::make_shared(std::make_shared(value)); - } - - template - static typename std::enable_if::value || - std::is_floating_point::value, - std::shared_ptr>::type - Make(T value) { - using ScalarType = typename CTypeTraits::ScalarType; - return std::make_shared(std::make_shared(value)); - } - - static std::shared_ptr Make(std::string value); - - static std::shared_ptr Make(const char* value); - - static std::shared_ptr Make(std::shared_ptr value) { - return std::make_shared(std::move(value)); - } - - static std::shared_ptr MakeNull( - const std::shared_ptr& type); - Result> Validate(const Schema& schema) const override; Result Evaluate(compute::FunctionContext* ctx, @@ -357,17 +334,30 @@ class ARROW_DS_EXPORT FieldExpression final : public Expression { ARROW_DS_EXPORT std::shared_ptr and_(std::shared_ptr lhs, std::shared_ptr rhs); +ARROW_DS_EXPORT std::shared_ptr and_(const ExpressionVector& subexpressions); + ARROW_DS_EXPORT AndExpression operator&&(const Expression& lhs, const Expression& rhs); ARROW_DS_EXPORT std::shared_ptr or_(std::shared_ptr lhs, std::shared_ptr rhs); +ARROW_DS_EXPORT std::shared_ptr or_(const ExpressionVector& subexpressions); + ARROW_DS_EXPORT OrExpression operator||(const Expression& lhs, const Expression& rhs); ARROW_DS_EXPORT std::shared_ptr not_(std::shared_ptr operand); ARROW_DS_EXPORT 
NotExpression operator!(const Expression& rhs); +inline std::shared_ptr scalar(std::shared_ptr value) { + return std::make_shared(std::move(value)); +} + +template +auto scalar(T&& value) -> decltype(scalar(MakeScalar(std::forward(value)))) { + return scalar(MakeScalar(std::forward(value))); +} + #define COMPARISON_FACTORY(NAME, FACTORY_NAME, OP) \ inline std::shared_ptr FACTORY_NAME( \ const std::shared_ptr& lhs, \ @@ -379,7 +369,7 @@ ARROW_DS_EXPORT NotExpression operator!(const Expression& rhs); template \ ComparisonExpression operator OP(const FieldExpression& lhs, T&& rhs) { \ return ComparisonExpression(compute::CompareOperator::NAME, lhs.Copy(), \ - ScalarExpression::Make(std::forward(rhs))); \ + scalar(std::forward(rhs))); \ } COMPARISON_FACTORY(EQUAL, equal, ==) COMPARISON_FACTORY(NOT_EQUAL, not_equal, !=) @@ -389,11 +379,6 @@ COMPARISON_FACTORY(LESS, less, <) COMPARISON_FACTORY(LESS_EQUAL, less_equal, <=) #undef COMPARISON_FACTORY -template -auto scalar(T&& value) -> decltype(ScalarExpression::Make(std::forward(value))) { - return ScalarExpression::Make(std::forward(value)); -} - inline std::shared_ptr field_ref(std::string name) { return std::make_shared(std::move(name)); } diff --git a/cpp/src/arrow/dataset/filter_test.cc b/cpp/src/arrow/dataset/filter_test.cc index a0783e2fb23f..8c92b7b92c9d 100644 --- a/cpp/src/arrow/dataset/filter_test.cc +++ b/cpp/src/arrow/dataset/filter_test.cc @@ -62,8 +62,8 @@ class ExpressionsTest : public ::testing::Test { ASSERT_TRUE(expr.right_operand()->Equals(rhs)); } - std::shared_ptr always = ScalarExpression::Make(true); - std::shared_ptr never = ScalarExpression::Make(false); + std::shared_ptr always = scalar(true); + std::shared_ptr never = scalar(false); }; TEST_F(ExpressionsTest, Equality) { @@ -106,8 +106,8 @@ TEST_F(ExpressionsTest, SimplificationAgainstCompoundCondition) { } TEST_F(ExpressionsTest, SimplificationToNull) { - auto null = ScalarExpression::MakeNull(boolean()); - auto null32 = 
ScalarExpression::MakeNull(int32()); + auto null = scalar(std::shared_ptr(new BooleanScalar)); + auto null32 = scalar(std::shared_ptr(new Int32Scalar)); AssertSimplifiesTo(*equal(field_ref("b"), null32), "b"_ == 3, *null); AssertSimplifiesTo(*not_equal(field_ref("b"), null32), "b"_ == 3, *null); @@ -191,7 +191,7 @@ TEST_F(FilterTest, Trivial) { {"a": 0, "b": 1.0, "in": 0} ])"); - AssertFilter(*ScalarExpression::MakeNull(boolean()), + AssertFilter(*scalar(std::shared_ptr(new BooleanScalar)), {field("a", int32()), field("b", float64())}, R"([ {"a": 0, "b": -0.1, "in": null}, {"a": 0, "b": 0.3, "in": null}, diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc new file mode 100644 index 000000000000..e30a017a05ae --- /dev/null +++ b/cpp/src/arrow/dataset/partition.cc @@ -0,0 +1,94 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/dataset/partition.h" + +#include +#include + +#include "arrow/dataset/filter.h" +#include "arrow/dataset/scanner.h" +#include "arrow/filesystem/path_util.h" +#include "arrow/scalar.h" +#include "arrow/util/iterator.h" +#include "arrow/util/stl.h" + +namespace arrow { +namespace dataset { + +Result> ConvertPartitionKeys( + const std::vector& keys, const Schema& schema) { + ExpressionVector subexpressions; + + for (const auto& key : keys) { + auto field = schema.GetFieldByName(key.name); + if (field == nullptr) { + continue; + } + + std::shared_ptr converted; + RETURN_NOT_OK(Scalar::Parse(field->type(), key.value, &converted)); + subexpressions.push_back(equal(field_ref(field->name()), scalar(converted))); + } + + return and_(subexpressions); +} + +Result> ConstantPartitionScheme::Parse( + const std::string& path) const { + return expression_; +} + +Result> SchemaPartitionScheme::Parse( + const std::string& path) const { + auto segments = fs::internal::SplitAbstractPath(path); + if (static_cast(segments.size()) < schema_->num_fields()) { + return Status::Invalid("path had too few segments (", segments.size(), ") to parse ", + schema_->num_fields(), " fields"); + } + + std::vector keys(schema_->num_fields()); + for (int i = 0; i < schema_->num_fields(); ++i) { + keys[i].name = schema_->field(i)->name(); + keys[i].value = std::move(segments[i]); + } + + return ConvertPartitionKeys(keys, *schema_); +} + +std::vector HivePartitionScheme::GetUnconvertedKeys( + const std::string& path) const { + auto segments = fs::internal::SplitAbstractPath(path); + + std::vector keys; + for (const auto& segment : segments) { + std::smatch matches; + static std::regex hive_style("^([^=]+)=(.*)$"); + if (std::regex_match(segment, matches, hive_style) && matches.size() == 3) { + keys.push_back({matches[1].str(), matches[2].str()}); + } + } + return keys; +} + +Result> HivePartitionScheme::Parse( + const std::string& path) const { + return 
ConvertPartitionKeys(GetUnconvertedKeys(path), *schema_); +} + +} // namespace dataset +} // namespace arrow diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h index d6adf157432b..a7ecf8d04296 100644 --- a/cpp/src/arrow/dataset/partition.h +++ b/cpp/src/arrow/dataset/partition.h @@ -17,12 +17,16 @@ #pragma once +#include #include +#include #include +#include #include #include #include "arrow/dataset/dataset.h" +#include "arrow/dataset/filter.h" #include "arrow/dataset/type_fwd.h" #include "arrow/dataset/visibility.h" @@ -30,65 +34,33 @@ namespace arrow { namespace dataset { // ---------------------------------------------------------------------- -// Computing partition values - -// TODO(wesm): API for computing partition keys derived from raw -// values. For example, year(value) or hash_function(value) instead of -// simply value, so a dataset with a timestamp column might group all -// data with year 2009 in the same partition - -// /// \brief -// class ScalarTransform { -// public: -// virtual Status Transform(const std::shared_ptr& input, -// std::shared_ptr* output) const = 0; -// }; - -// class PartitionField { -// public: - -// private: -// std::string field_name_; -// }; - -// ---------------------------------------------------------------------- -// Partition identifiers - -/// \brief A partition level identifier which can be used -/// -/// TODO(wesm): Is this general enough? What other kinds of partition -/// keys exist and do we need to support them? -class PartitionKey { - public: - const std::vector& fields() const { return fields_; } - const std::vector>& values() const { return values_; } +// Partition schemes - private: - std::vector fields_; - std::vector> values_; +struct ARROW_DS_EXPORT UnconvertedKey { + std::string name, value; }; -/// \brief Intermediate data structure for data parsed from a string -/// partition identifier. 
+/// \brief Helper function for the common case of combining partition information +/// consisting of equality expressions into a single conjunction expression. +/// Fields referenced in keys but absent from schema will be ignored. +ARROW_DS_EXPORT +Result> ConvertPartitionKeys( + const std::vector& keys, const Schema& schema); + +/// \brief Interface for parsing partition expressions from string partition +/// identifiers. /// -/// For example, the identifier "foo=5" might be parsed with a single -/// "foo" field and the value 5. A more complex identifier might be -/// written as "foo=5,bar=2", which would yield two fields and two -/// values. +/// For example, the identifier "foo=5" might be parsed to an equality expression +/// between the "foo" field and the value 5. /// /// Some partition schemes may store the field names in a metadata /// store instead of in file paths, for example /// dataset_root/2009/11/... could be used when the partition fields /// are "year" and "month" -struct PartitionKeyData { - std::vector fields; - std::vector> values; -}; - -// ---------------------------------------------------------------------- -// Partition schemes - -/// \brief +/// +/// Paths are consumed from left to right. Paths must be relative to +/// the root of a partition; path prefixes must be removed before passing +/// the path to a scheme for parsing. 
class ARROW_DS_EXPORT PartitionScheme { public: virtual ~PartitionScheme() = default; @@ -96,119 +68,99 @@ class ARROW_DS_EXPORT PartitionScheme { /// \brief The name identifying the kind of partition scheme virtual std::string name() const = 0; - virtual bool PathMatchesScheme(const std::string& path) const = 0; + /// \brief Parse a path into a partition expression + /// + /// \param[in] path the partition identifier to parse + /// \return the parsed expression + virtual Result> Parse(const std::string& path) const = 0; - virtual Status ParseKey(const std::string& path, PartitionKeyData* out) const = 0; + /// \brief Status return + out arg overload + Status Parse(const std::string& path, std::shared_ptr* out) const { + return Parse(path).Value(out); + } }; -/// \brief Multi-level, directory based partitioning scheme -/// originating from Apache Hive with all data files stored in the -/// leaf directories. Data is partitioned by static values of a -/// particular column in the schema. Partition keys are represented in -/// the form $key=$value in directory names -class ARROW_DS_EXPORT HivePartitionScheme : public PartitionScheme { +/// \brief Trivial partition scheme which yields an expression provided on construction. +class ARROW_DS_EXPORT ConstantPartitionScheme : public PartitionScheme { public: - /// \brief Return true if path - bool PathMatchesScheme(const std::string& path) const override; - - virtual Status ParseKey(const std::string& path, PartitionKeyData* out) const = 0; -}; + explicit ConstantPartitionScheme(std::shared_ptr expr) + : expression_(std::move(expr)) {} -// ---------------------------------------------------------------------- -// - -// Partitioned datasets come in different forms. 
Here is an example of -// a Hive-style partitioned dataset: -// -// dataset_root/ -// key1=$k1_v1/ -// key2=$k2_v1/ -// 0.parquet -// 1.parquet -// 2.parquet -// 3.parquet -// key2=$k2_v2/ -// 0.parquet -// 1.parquet -// key1=$k1_v2/ -// key2=$k2_v1/ -// 0.parquet -// 1.parquet -// key2=$k2_v2/ -// 0.parquet -// 1.parquet -// 2.parquet -// -// In this case, the dataset has 11 fragments (11 files) to be -// scanned, or potentially more if it is configured to split Parquet -// files at the row group level - -class ARROW_DS_EXPORT Partition : public DataSource { - public: - std::string type() const override; + std::string name() const override { return "constant_partition_scheme"; } - /// \brief The key for this partition source, may be nullptr, - /// e.g. for the top-level partitioned source container - virtual const PartitionKey* key() const = 0; + Result> Parse(const std::string& path) const override; - virtual DataFragmentIterator GetFragments(const Selector& selector) = 0; + private: + std::shared_ptr expression_; }; -/// \brief Simple implementation of Partition, which consists of a -/// partition identifier, subpartitions, and some data fragments -class ARROW_DS_EXPORT SimplePartition : public Partition { +/// \brief SchemaPartitionScheme parses one segment of a path for each field in its +/// schema. All fields are required, so paths passed to SchemaPartitionScheme::Parse +/// must contain segments for each field. 
+/// +/// For example given schema the path "/2009/11" would be +/// parsed to ("year"_ == 2009 and "month"_ == 11) +class ARROW_DS_EXPORT SchemaPartitionScheme : public PartitionScheme { public: - SimplePartition(std::unique_ptr partition_key, - DataFragmentVector&& data_fragments, PartitionVector&& subpartitions, - std::shared_ptr scan_options = NULLPTR) - : key_(std::move(partition_key)), - data_fragments_(std::move(data_fragments)), - subpartitions_(std::move(subpartitions)), - scan_options_(scan_options) {} + explicit SchemaPartitionScheme(std::shared_ptr schema) + : schema_(std::move(schema)) {} - const PartitionKey* key() const override { return key_.get(); } + std::string name() const override { return "schema_partition_scheme"; } - int num_subpartitions() const { return static_cast(subpartitions_.size()); } + Result> Parse(const std::string& path) const override; - int num_data_fragments() const { return static_cast(data_fragments__.size()); } + const std::shared_ptr& schema() { return schema_; } - const PartitionVector& subpartitions() const { return subpartitions_; } - const DataFragmentVector& data_fragments() const { return data_fragments_; } + protected: + std::shared_ptr schema_; +}; - DataFragmentIterator GetFragments(const FilterVector& filters) override; +/// \brief Multi-level, directory based partitioning scheme +/// originating from Apache Hive with all data files stored in the +/// leaf directories. Data is partitioned by static values of a +/// particular column in the schema. Partition keys are represented in +/// the form $key=$value in directory names. +/// Field order is ignored, as are missing or unrecognized field names. 
+/// +/// For example given schema the path +/// "/day=321/ignored=3.4/year=2009" parses to ("year"_ == 2009 and "day"_ == 321) +class ARROW_DS_EXPORT HivePartitionScheme : public PartitionScheme { + public: + explicit HivePartitionScheme(std::shared_ptr schema) + : schema_(std::move(schema)) {} - private: - std::unique_ptr key_; + std::string name() const override { return "hive_partition_scheme"; } - /// \brief Data fragments belonging to this partition level. In some - /// partition schemes such as Hive-style, this member is - /// mutually-exclusive with subpartitions, where data fragments - /// occur only in the partition leaves - std::vector> data_fragments_; + Result> Parse(const std::string& path) const override; - /// \brief Child partitions of this partition - std::vector> subpartitions_; + std::vector GetUnconvertedKeys(const std::string& path) const; - /// \brief Default scan options to use for data fragments - std::shared_ptr scan_options_; + const std::shared_ptr& schema() { return schema_; } + + protected: + std::shared_ptr schema_; }; -/// \brief A PartitionSource that returns fragments as the result of input iterators -class ARROW_DS_EXPORT LazyPartition : public Partition { +/// \brief Implementation provided by lambda or other callable +class ARROW_DS_EXPORT FunctionPartitionScheme : public PartitionScheme { public: - const PartitionKey* key() const override; - - DataFragmentIterator GetFragments(const& DataSelector selector) override; + explicit FunctionPartitionScheme( + std::function>(const std::string&)> impl, + std::string name = "function_partition_scheme") + : impl_(std::move(impl)), name_(std::move(name)) {} - // TODO(wesm): Iterate over subpartitions + std::string name() const override { return name_; } - protected: - PartitionIterator partition_iter_; + Result> Parse(const std::string& path) const override { + return impl_(path); + } - // By default, once this source is consumed using GetFragments, it - // cannot be consumed again. 
By setting this to true, we cache - bool cache_manifest_ = false; + private: + std::function>(const std::string&)> impl_; + std::string name_; }; +// TODO(bkietz) use RE2 and named groups to provide RegexpPartitionScheme + } // namespace dataset } // namespace arrow diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc new file mode 100644 index 000000000000..41d90d6a052e --- /dev/null +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -0,0 +1,226 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include + +#include +#include + +#include "arrow/dataset/api.h" +#include "arrow/dataset/partition.h" +#include "arrow/dataset/test_util.h" +#include "arrow/filesystem/localfs.h" +#include "arrow/filesystem/path_util.h" +#include "arrow/status.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/util/io_util.h" + +namespace arrow { +namespace dataset { + +class TestPartitionScheme : public ::testing::Test { + public: + void AssertParseError(const std::string& path) { + ASSERT_RAISES(Invalid, scheme_->Parse(path).status()); + } + + void AssertParse(const std::string& path, std::shared_ptr expected) { + for (std::string suffix : {"", "/dat.parquet"}) { + ASSERT_OK_AND_ASSIGN(auto parsed, scheme_->Parse(path + suffix)); + + ASSERT_NE(parsed, nullptr); + ASSERT_TRUE(parsed->Equals(*expected)) << parsed->ToString() << "\n" + << expected->ToString(); + } + } + + void AssertParse(const std::string& path, const Expression& expected) { + AssertParse(path, expected.Copy()); + } + + protected: + std::shared_ptr scheme_; +}; + +TEST_F(TestPartitionScheme, Simple) { + auto expr = equal(field_ref("alpha"), scalar(3)); + scheme_ = std::make_shared(expr); + AssertParse("/hello/world", expr); +} + +TEST_F(TestPartitionScheme, Schema) { + scheme_ = std::make_shared( + schema({field("alpha", int32()), field("beta", utf8())})); + + AssertParse("/0/hello", "alpha"_ == int32_t(0) and "beta"_ == "hello"); + AssertParseError("/world/0"); // reversed order + AssertParseError("/3"); // valid alpha, but missing beta + AssertParseError("/0.0/foo"); // invalid alpha + AssertParseError("/3.25"); // invalid alpha with missing beta + AssertParseError(""); // no segments to parse + + // gotcha someday: + AssertParse("/0/dat.parquet", "alpha"_ == int32_t(0) and "beta"_ == "dat.parquet"); + + AssertParse("/0/foo/ignored=2341", "alpha"_ == int32_t(0) and "beta"_ == "foo"); +} + +TEST_F(TestPartitionScheme, Hive) { + scheme_ = std::make_shared( + 
schema({field("alpha", int32()), field("beta", float32())})); + + AssertParse("/alpha=0/beta=3.25", "alpha"_ == int32_t(0) and "beta"_ == 3.25f); + AssertParse("/beta=3.25/alpha=0", "beta"_ == 3.25f and "alpha"_ == int32_t(0)); + AssertParse("/alpha=0", "alpha"_ == int32_t(0)); + AssertParse("/beta=3.25", "beta"_ == 3.25f); + AssertParse("", scalar(true)); + + AssertParse("/alpha=0/unexpected/beta=3.25", + "alpha"_ == int32_t(0) and "beta"_ == 3.25f); + + AssertParse("/alpha=0/beta=3.25/ignored=2341", + "alpha"_ == int32_t(0) and "beta"_ == 3.25f); + + AssertParse("/ignored=2341", scalar(true)); + + AssertParseError("/alpha=0.0/beta=3.25"); // conversion of "0.0" to int32 fails +} + +template +void PopFront(size_t n, std::vector* v) { + std::move(v->begin() + n, v->end(), v->begin()); + v->resize(v->size() - n); +} + +TEST_F(TestPartitionScheme, EtlThenHive) { + SchemaPartitionScheme etl_scheme(schema({field("year", int16()), field("month", int8()), + field("day", int8()), field("hour", int8())})); + HivePartitionScheme alphabeta_scheme( + schema({field("alpha", int32()), field("beta", float32())})); + + scheme_ = std::make_shared( + [&](const std::string& path) -> Result> { + ARROW_ASSIGN_OR_RAISE(auto etl_expr, etl_scheme.Parse(path)); + + auto segments = fs::internal::SplitAbstractPath(path); + PopFront(etl_scheme.schema()->num_fields(), &segments); + ARROW_ASSIGN_OR_RAISE( + auto alphabeta_expr, + alphabeta_scheme.Parse(fs::internal::JoinAbstractPath(segments))); + + return and_(std::move(etl_expr), std::move(alphabeta_expr)); + }); + + AssertParse("/1999/12/31/00/alpha=0/beta=3.25", + "year"_ == int16_t(1999) and "month"_ == int8_t(12) and + "day"_ == int8_t(31) and "hour"_ == int8_t(0) and + ("alpha"_ == int32_t(0) and "beta"_ == 3.25f)); + + AssertParseError("/20X6/03/21/05/alpha=0/beta=3.25"); +} + +TEST_F(TestPartitionScheme, Set) { + // An adhoc partition scheme which parses segments like "/x in [1 4 5]" + // into ("x"_ == 1 or "x"_ == 4 or "x"_ == 5) + 
scheme_ = std::make_shared( + [](const std::string& path) -> Result> { + std::smatch matches; + auto segment = std::move(fs::internal::SplitAbstractPath(path)[0]); + static std::regex re("^x in \\[(.*)\\]$"); + if (!std::regex_match(segment, matches, re) || matches.size() != 2) { + return Status::Invalid("regex failed to parse"); + } + + ExpressionVector subexpressions; + std::string element; + std::istringstream elements(matches[1]); + while (elements >> element) { + std::shared_ptr s; + RETURN_NOT_OK(Scalar::Parse(int32(), element, &s)); + subexpressions.push_back(equal(field_ref("x"), scalar(s))); + } + + return or_(std::move(subexpressions)); + }); + + AssertParse("/x in [1]", "x"_ == 1); + AssertParse("/x in [1 4 5]", "x"_ == 1 or "x"_ == 4 or "x"_ == 5); + AssertParse("/x in []", scalar(false)); +} + +// An adhoc partition scheme which parses segments like "/x=[-3.25, 0.0)" +// into ("x"_ >= -3.25 and "x" < 0.0) +class RangePartitionScheme : public HivePartitionScheme { + public: + using HivePartitionScheme::HivePartitionScheme; + + std::string name() const override { return "range_partition_scheme"; } + + Result> Parse(const std::string& path) const override { + ExpressionVector ranges; + for (auto key : GetUnconvertedKeys(path)) { + std::smatch matches; + RETURN_NOT_OK(DoRegex(key.value, &matches)); + + auto& min_cmp = matches[1] == "[" ? greater_equal : greater; + std::string min_repr = matches[2]; + std::string max_repr = matches[3]; + auto& max_cmp = matches[4] == "]" ? 
less_equal : less; + + const auto& type = schema_->GetFieldByName(key.name)->type(); + std::shared_ptr min, max; + RETURN_NOT_OK(Scalar::Parse(type, min_repr, &min)); + RETURN_NOT_OK(Scalar::Parse(type, max_repr, &max)); + + ranges.push_back(and_(min_cmp(field_ref(key.name), scalar(min)), + max_cmp(field_ref(key.name), scalar(max)))); + } + return and_(ranges); + } + + static Status DoRegex(const std::string& segment, std::smatch* matches) { + static std::regex re( + "^" + "([\\[\\(])" // open bracket or paren + "([^ ]+)" // representation of range minimum + " " + "([^ ]+)" // representation of range maximum + "([\\]\\)])" // close bracket or paren + "$"); + + if (!std::regex_match(segment, *matches, re) || matches->size() != 5) { + return Status::Invalid("regex failed to parse"); + } + + return Status::OK(); + } +}; + +TEST_F(TestPartitionScheme, Range) { + scheme_ = std::make_shared( + schema({field("x", float64()), field("y", float64()), field("z", float64())})); + + AssertParse("/x=[-1.5 0.0)/y=[0.0 1.5)/z=(1.5 3.0]", + ("x"_ >= -1.5 and "x"_ < 0.0) and ("y"_ >= 0.0 and "y"_ < 1.5) and + ("z"_ > 1.5 and "z"_ <= 3.0)); +} + +} // namespace dataset +} // namespace arrow diff --git a/cpp/src/arrow/result.h b/cpp/src/arrow/result.h index 0bbb0450dbd4..f71d51ee42b8 100644 --- a/cpp/src/arrow/result.h +++ b/cpp/src/arrow/result.h @@ -170,9 +170,9 @@ class Result { /// `T` must be implicitly constructible from `const U &`. /// /// \param other The value to copy from. - template ::value && - std::is_convertible::value>::type> + template ::value && + std::is_convertible::value>::type> Result(const Result& other) : variant_("unitialized") { AssignVariant(other.variant_); } @@ -191,7 +191,7 @@ class Result { /// /// \param other The Result object to move from and set to a non-OK status. 
template ::value && + typename E = typename std::enable_if::value && std::is_convertible::value>::type> Result(Result&& other) : variant_("unitialized") { AssignVariant(std::move(other.variant_)); @@ -284,6 +284,17 @@ class Result { } T operator*() && { return ValueOrDie(); } + /// Helper method for using Results in Status returning out arg functions + template ::value>::type> + Status Value(U* out) && { + if (!ok()) { + return status(); + } + *out = U(std::move(arrow::util::get(variant_))); + return Status::OK(); + } + private: // Assignment is disabled by default so we need to destruct/reconstruct // the value. diff --git a/cpp/src/arrow/result_test.cc b/cpp/src/arrow/result_test.cc index 92a2cdea7847..caecb115127b 100644 --- a/cpp/src/arrow/result_test.cc +++ b/cpp/src/arrow/result_test.cc @@ -67,17 +67,31 @@ struct ImplicitlyCopyConvertible { struct MoveOnlyDataType { explicit MoveOnlyDataType(int x) : data(new int(x)) {} - MoveOnlyDataType(MoveOnlyDataType&& other) : data(other.data) { other.data = nullptr; } - MoveOnlyDataType(const MoveOnlyDataType& other) = delete; MoveOnlyDataType& operator=(const MoveOnlyDataType& other) = delete; - ~MoveOnlyDataType() { - delete data; - data = nullptr; + MoveOnlyDataType(MoveOnlyDataType&& other) { MoveFrom(other); } + MoveOnlyDataType& operator=(MoveOnlyDataType&& other) { + MoveFrom(other); + return *this; + } + + ~MoveOnlyDataType() { Destroy(); } + + void Destroy() { + if (data != nullptr) { + delete data; + data = nullptr; + } } - int* data; + void MoveFrom(MoveOnlyDataType& other) { + Destroy(); + data = other.data; + other.data = nullptr; + } + + int* data = nullptr; }; struct ImplicitlyMoveConvertible { @@ -397,8 +411,8 @@ TYPED_TEST(ResultTest, MoveAssignmentSelfOkStatus) { // Verify that a Result object can be constructed from a move-only type. 
TEST(ResultTest, InitializationMoveOnlyType) { - std::string* str = new std::string(kStringElement); - std::unique_ptr value(str); + std::unique_ptr value(new std::string(kStringElement)); + auto str = value.get(); Result> result(std::move(value)); ASSERT_TRUE(result.ok()); @@ -407,8 +421,8 @@ TEST(ResultTest, InitializationMoveOnlyType) { // Verify that a Result object can be move-constructed from a move-only type. TEST(ResultTest, MoveConstructorMoveOnlyType) { - std::string* str = new std::string(kStringElement); - std::unique_ptr value(str); + std::unique_ptr value(new std::string(kStringElement)); + auto str = value.get(); Result> result1(std::move(value)); Result> result2(std::move(result1)); @@ -424,8 +438,8 @@ TEST(ResultTest, MoveConstructorMoveOnlyType) { // Verify that a Result object can be move-assigned to from a Result object // containing a move-only type. TEST(ResultTest, MoveAssignmentMoveOnlyType) { - std::string* str = new std::string(kStringElement); - std::unique_ptr value(str); + std::unique_ptr value(new std::string(kStringElement)); + auto str = value.get(); Result> result1(std::move(value)); Result> result2(Status(kErrorCode, kErrorMessage)); @@ -443,8 +457,8 @@ TEST(ResultTest, MoveAssignmentMoveOnlyType) { // Verify that a value can be moved out of a Result object via ValueOrDie(). 
TEST(ResultTest, ValueOrDieMovedValue) { - std::string* str = new std::string(kStringElement); - std::unique_ptr value(str); + std::unique_ptr value(new std::string(kStringElement)); + auto str = value.get(); Result> result(std::move(value)); std::unique_ptr moved_value = std::move(result).ValueOrDie(); @@ -475,6 +489,44 @@ TEST(ResultTest, TemplateValueMoveConstruction) { EXPECT_EQ(*result.ValueOrDie().move_only.data, kIntElement); } +// Verify that a Result can be unpacked to T +TEST(ResultTest, StatusReturnAdapterCopyValue) { + Result result(CopyOnlyDataType{kIntElement}); + CopyOnlyDataType copy_only{0}; + + EXPECT_TRUE(std::move(result).Value(©_only).ok()); + EXPECT_EQ(copy_only.data, kIntElement); +} + +// Verify that a Result can be unpacked to some U, where U is +// a type which has a constructor taking a const T &. +TEST(ResultTest, StatusReturnAdapterCopyAndConvertValue) { + Result result(CopyOnlyDataType{kIntElement}); + ImplicitlyCopyConvertible implicitly_convertible(CopyOnlyDataType{0}); + + EXPECT_TRUE(std::move(result).Value(&implicitly_convertible).ok()); + EXPECT_EQ(implicitly_convertible.copy_only.data, kIntElement); +} + +// Verify that a Result can be unpacked to T +TEST(ResultTest, StatusReturnAdapterMoveValue) { + Result result(MoveOnlyDataType{kIntElement}); + MoveOnlyDataType move_only{0}; + + EXPECT_TRUE(std::move(result).Value(&move_only).ok()); + EXPECT_EQ(*move_only.data, kIntElement); +} + +// Verify that a Result can be unpacked to some U, where U is +// a type which has a constructor taking a T &&. 
+TEST(ResultTest, StatusReturnAdapterMoveAndConvertValue) { + Result result(MoveOnlyDataType{kIntElement}); + ImplicitlyMoveConvertible implicitly_convertible(MoveOnlyDataType{0}); + + EXPECT_TRUE(std::move(result).Value(&implicitly_convertible).ok()); + EXPECT_EQ(*implicitly_convertible.move_only.data, kIntElement); +} + // Verify that a Result is assignable to a Result, where T // is a type which has an implicit constructor taking a const U &. TEST(ResultTest, TemplateCopyAssign) { diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc index 10019906df18..909d50ca52d4 100644 --- a/cpp/src/arrow/scalar.cc +++ b/cpp/src/arrow/scalar.cc @@ -18,6 +18,8 @@ #include "arrow/scalar.h" #include +#include +#include #include "arrow/array.h" #include "arrow/buffer.h" @@ -26,6 +28,7 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" #include "arrow/util/logging.h" +#include "arrow/util/parsing.h" #include "arrow/visitor_inline.h" namespace arrow { @@ -58,6 +61,9 @@ DurationScalar::DurationScalar(int64_t value, const std::shared_ptr& t DCHECK_EQ(Type::DURATION, type->id()); } +MonthIntervalScalar::MonthIntervalScalar(int32_t value, bool is_valid) + : internal::PrimitiveScalar{month_interval(), is_valid}, value(value) {} + MonthIntervalScalar::MonthIntervalScalar(int32_t value, const std::shared_ptr& type, bool is_valid) @@ -67,6 +73,10 @@ MonthIntervalScalar::MonthIntervalScalar(int32_t value, checked_cast(type.get())->interval_type()); } +DayTimeIntervalScalar::DayTimeIntervalScalar(DayTimeIntervalType::DayMilliseconds value, + bool is_valid) + : internal::PrimitiveScalar{day_time_interval(), is_valid}, value(value) {} + DayTimeIntervalScalar::DayTimeIntervalScalar(DayTimeIntervalType::DayMilliseconds value, const std::shared_ptr& type, bool is_valid) @@ -76,6 +86,9 @@ DayTimeIntervalScalar::DayTimeIntervalScalar(DayTimeIntervalType::DayMillisecond checked_cast(type.get())->interval_type()); } +StringScalar::StringScalar(std::string s) + : 
StringScalar(Buffer::FromString(std::move(s)), true) {} + FixedSizeBinaryScalar::FixedSizeBinaryScalar(const std::shared_ptr& value, const std::shared_ptr& type, bool is_valid) @@ -95,36 +108,23 @@ BaseListScalar::BaseListScalar(const std::shared_ptr& value, BaseListScalar::BaseListScalar(const std::shared_ptr& value, bool is_valid) : BaseListScalar(value, value->type(), is_valid) {} -MapScalar::MapScalar(const std::shared_ptr& keys, - const std::shared_ptr& items, - const std::shared_ptr& type, bool is_valid) - : Scalar{type, is_valid}, keys(keys), items(items) {} - -MapScalar::MapScalar(const std::shared_ptr& keys, - const std::shared_ptr& values, bool is_valid) - : MapScalar(keys, values, map(keys->type(), values->type()), is_valid) {} - FixedSizeListScalar::FixedSizeListScalar(const std::shared_ptr& value, const std::shared_ptr& type, bool is_valid) - : Scalar{type, is_valid}, value(value) { + : BaseListScalar(value, type, is_valid) { ARROW_CHECK_EQ(value->length(), checked_cast(type.get())->list_size()); } -FixedSizeListScalar::FixedSizeListScalar(const std::shared_ptr& value, - bool is_valid) - : FixedSizeListScalar(value, value->type(), is_valid) {} - +// TODO(bkietz) This doesn't need a factory. 
Just rewrite all scalars to be generically +// constructible (is_simple_scalar should apply to all scalars) struct MakeNullImpl { - template - using ScalarType = typename TypeTraits::ScalarType; - - template - typename std::enable_if>::value, - Status>::type - Visit(const T&) { - *out_ = std::make_shared>(); + template ::ScalarType, + typename ValueType = typename ScalarType::ValueType, + typename Enable = typename std::enable_if< + internal::is_simple_scalar::value>::type> + Status Visit(const T&) { + *out_ = std::make_shared(ValueType(), type_, false); return Status::OK(); } @@ -132,7 +132,7 @@ struct MakeNullImpl { return Status::NotImplemented("construcing null scalars of type ", t); } - std::shared_ptr type_; + const std::shared_ptr& type_; std::shared_ptr* out_; }; @@ -142,4 +142,56 @@ Status MakeNullScalar(const std::shared_ptr& type, return VisitTypeInline(*type, &impl); } +struct ScalarParseImpl { + template , + typename Value = typename Converter::value_type> + Status Visit(const T& t) { + Value value; + if (!Converter{type_}(s_.data(), s_.size(), &value)) { + return Status::Invalid("error parsing '", s_, "' as scalar of type ", t); + } + return Finish(std::move(value)); + } + + Status Visit(const BinaryType&) { return FinishWithBuffer(); } + + Status Visit(const LargeBinaryType&) { return FinishWithBuffer(); } + + Status Visit(const FixedSizeBinaryType& t) { return FinishWithBuffer(); } + + Status Visit(const DataType& t) { + return Status::NotImplemented("parsing scalars of type ", t); + } + + template + Status Finish(Arg&& arg) { + return MakeScalar(type_, std::forward(arg), out_); + } + + Status FinishWithBuffer() { return Finish(Buffer::FromString(s_.to_string())); } + + ScalarParseImpl(const std::shared_ptr& type, util::string_view s, + std::shared_ptr* out) + : type_(type), s_(s), out_(out) {} + + const std::shared_ptr& type_; + util::string_view s_; + std::shared_ptr* out_; +}; + +Status Scalar::Parse(const std::shared_ptr& type, util::string_view 
s, + std::shared_ptr* out) { + ScalarParseImpl impl = {type, s, out}; + return VisitTypeInline(*type, &impl); +} + +namespace internal { +Status CheckBufferLength(const FixedSizeBinaryType* t, const std::shared_ptr* b) { + return t->byte_width() == (*b)->size() + ? Status::OK() + : Status::Invalid("buffer length ", (*b)->size(), " is not compatible with ", + *t); +} +} // namespace internal + } // namespace arrow diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h index c407ebc29c92..5ddb815d4f25 100644 --- a/cpp/src/arrow/scalar.h +++ b/cpp/src/arrow/scalar.h @@ -24,12 +24,16 @@ #pragma once #include +#include +#include #include #include "arrow/type.h" #include "arrow/type_fwd.h" #include "arrow/type_traits.h" #include "arrow/util/decimal.h" +#include "arrow/util/logging.h" +#include "arrow/util/string_view.h" #include "arrow/util/visibility.h" namespace arrow { @@ -53,6 +57,9 @@ struct ARROW_EXPORT Scalar { return false; } + static Status Parse(const std::shared_ptr& type, util::string_view s, + std::shared_ptr* out); + protected: Scalar(const std::shared_ptr& type, bool is_valid) : type(type), is_valid(is_valid) {} @@ -66,6 +73,22 @@ struct ARROW_EXPORT NullScalar : public Scalar { namespace internal { +template +struct is_simple_scalar : std::false_type {}; + +template +struct is_simple_scalar< + T, + typename std::enable_if< + // scalar has a single extra data member named "value" with type "ValueType" + std::is_same().value), typename T::ValueType>::value && + // scalar is constructible from (value, type, is_valid) + std::is_constructible, + bool>::value>::type> : std::true_type {}; + +template +using enable_if_simple_scalar = std::enable_if::value, R>; + struct ARROW_EXPORT PrimitiveScalar : public Scalar { using Scalar::Scalar; }; @@ -73,29 +96,44 @@ struct ARROW_EXPORT PrimitiveScalar : public Scalar { } // namespace internal struct ARROW_EXPORT BooleanScalar : public internal::PrimitiveScalar { + using ValueType = bool; + bool value; + explicit 
BooleanScalar(bool value, bool is_valid = true) : internal::PrimitiveScalar{boolean(), is_valid}, value(value) {} + BooleanScalar() : BooleanScalar(false, false) {} + + BooleanScalar(bool value, const std::shared_ptr& type, bool is_valid = true) + : BooleanScalar(value, is_valid) { + ARROW_CHECK_EQ(type->id(), Type::BOOL); + } }; -template +template struct NumericScalar : public internal::PrimitiveScalar { - using T = typename Type::c_type; - T value; + using ValueType = typename T::c_type; - explicit NumericScalar(T value, bool is_valid = true) - : NumericScalar(value, TypeTraits::type_singleton(), is_valid) {} + ValueType value; + + explicit NumericScalar(ValueType value, bool is_valid = true) + : internal::PrimitiveScalar(TypeTraits::type_singleton(), is_valid), + value(value) {} NumericScalar() : NumericScalar(0, false) {} - protected: - explicit NumericScalar(T value, const std::shared_ptr& type, bool is_valid) - : internal::PrimitiveScalar{type, is_valid}, value(value) {} + NumericScalar(ValueType value, const std::shared_ptr& type, + bool is_valid = true) + : NumericScalar(value, is_valid) { + ARROW_CHECK_EQ(type->id(), T::type_id); + } }; -template +template struct BaseBinaryScalar : public Scalar { + using ValueType = std::shared_ptr; + std::shared_ptr value; protected: @@ -105,35 +143,47 @@ struct BaseBinaryScalar : public Scalar { }; struct ARROW_EXPORT BinaryScalar : public BaseBinaryScalar { + BinaryScalar(const std::shared_ptr& value, + const std::shared_ptr& type, bool is_valid = true) + : BaseBinaryScalar(value, type, is_valid) {} + explicit BinaryScalar(const std::shared_ptr& value, bool is_valid = true) - : BaseBinaryScalar(value, binary(), is_valid) {} + : BinaryScalar(value, binary(), is_valid) {} BinaryScalar() : BinaryScalar(NULLPTR, false) {} - - protected: - using BaseBinaryScalar::BaseBinaryScalar; }; struct ARROW_EXPORT StringScalar : public BinaryScalar { + StringScalar(const std::shared_ptr& value, + const std::shared_ptr& type, bool 
is_valid = true) + : BinaryScalar(value, type, is_valid) {} + explicit StringScalar(const std::shared_ptr& value, bool is_valid = true) - : BinaryScalar(value, utf8(), is_valid) {} + : StringScalar(value, utf8(), is_valid) {} + + explicit StringScalar(std::string s); StringScalar() : StringScalar(NULLPTR, false) {} }; struct ARROW_EXPORT LargeBinaryScalar : public BaseBinaryScalar { + LargeBinaryScalar(const std::shared_ptr& value, + const std::shared_ptr& type, bool is_valid = true) + : BaseBinaryScalar(value, type, is_valid) {} + explicit LargeBinaryScalar(const std::shared_ptr& value, bool is_valid = true) - : BaseBinaryScalar(value, large_binary(), is_valid) {} + : LargeBinaryScalar(value, large_binary(), is_valid) {} LargeBinaryScalar() : LargeBinaryScalar(NULLPTR, false) {} - - protected: - using BaseBinaryScalar::BaseBinaryScalar; }; struct ARROW_EXPORT LargeStringScalar : public LargeBinaryScalar { + LargeStringScalar(const std::shared_ptr& value, + const std::shared_ptr& type, bool is_valid = true) + : LargeBinaryScalar(value, type, is_valid) {} + explicit LargeStringScalar(const std::shared_ptr& value, bool is_valid = true) - : LargeBinaryScalar(value, large_utf8(), is_valid) {} + : LargeStringScalar(value, large_utf8(), is_valid) {} LargeStringScalar() : LargeStringScalar(NULLPTR, false) {} }; @@ -155,59 +205,86 @@ class ARROW_EXPORT Date64Scalar : public NumericScalar { class ARROW_EXPORT Time32Scalar : public internal::PrimitiveScalar { public: - int32_t value; + using ValueType = int32_t; + Time32Scalar(int32_t value, const std::shared_ptr& type, bool is_valid = true); + + int32_t value; }; class ARROW_EXPORT Time64Scalar : public internal::PrimitiveScalar { public: - int64_t value; + using ValueType = int64_t; + Time64Scalar(int64_t value, const std::shared_ptr& type, bool is_valid = true); + + int64_t value; }; class ARROW_EXPORT TimestampScalar : public internal::PrimitiveScalar { public: - int64_t value; + using ValueType = int64_t; + 
TimestampScalar(int64_t value, const std::shared_ptr& type, bool is_valid = true); + + int64_t value; }; class ARROW_EXPORT DurationScalar : public internal::PrimitiveScalar { public: - int64_t value; + using ValueType = int64_t; + DurationScalar(int64_t value, const std::shared_ptr& type, bool is_valid = true); + + int64_t value; }; class ARROW_EXPORT MonthIntervalScalar : public internal::PrimitiveScalar { public: - int32_t value; + using ValueType = int32_t; + + explicit MonthIntervalScalar(int32_t value, bool is_valid = true); MonthIntervalScalar(int32_t value, const std::shared_ptr& type, bool is_valid = true); + + int32_t value; }; class ARROW_EXPORT DayTimeIntervalScalar : public internal::PrimitiveScalar { public: - DayTimeIntervalType::DayMilliseconds value; + using ValueType = DayTimeIntervalType::DayMilliseconds; + + explicit DayTimeIntervalScalar(DayTimeIntervalType::DayMilliseconds value, + bool is_valid = true); + DayTimeIntervalScalar(DayTimeIntervalType::DayMilliseconds value, const std::shared_ptr& type, bool is_valid = true); + + DayTimeIntervalType::DayMilliseconds value; }; struct ARROW_EXPORT Decimal128Scalar : public Scalar { - Decimal128 value; + using ValueType = Decimal128; + Decimal128Scalar(const Decimal128& value, const std::shared_ptr& type, bool is_valid = true); + + Decimal128 value; }; struct ARROW_EXPORT BaseListScalar : public Scalar { - std::shared_ptr value; + using ValueType = std::shared_ptr; BaseListScalar(const std::shared_ptr& value, const std::shared_ptr& type, bool is_valid = true); BaseListScalar(const std::shared_ptr& value, bool is_valid); + + std::shared_ptr value; }; struct ARROW_EXPORT ListScalar : public BaseListScalar { @@ -218,27 +295,19 @@ struct ARROW_EXPORT LargeListScalar : public BaseListScalar { using BaseListScalar::BaseListScalar; }; -struct ARROW_EXPORT MapScalar : public Scalar { - std::shared_ptr keys; - std::shared_ptr items; - - MapScalar(const std::shared_ptr& keys, const std::shared_ptr& values, - 
const std::shared_ptr& type, bool is_valid = true); - - MapScalar(const std::shared_ptr& keys, const std::shared_ptr& values, - bool is_valid = true); +struct ARROW_EXPORT MapScalar : public BaseListScalar { + using BaseListScalar::BaseListScalar; }; -struct ARROW_EXPORT FixedSizeListScalar : public Scalar { - std::shared_ptr value; - +struct ARROW_EXPORT FixedSizeListScalar : public BaseListScalar { FixedSizeListScalar(const std::shared_ptr& value, const std::shared_ptr& type, bool is_valid = true); - explicit FixedSizeListScalar(const std::shared_ptr& value, bool is_valid = true); + using BaseListScalar::BaseListScalar; }; struct ARROW_EXPORT StructScalar : public Scalar { + using ValueType = std::vector>; std::vector> value; }; @@ -253,4 +322,58 @@ ARROW_EXPORT Status MakeNullScalar(const std::shared_ptr& type, std::shared_ptr* null); +namespace internal { + +inline Status CheckBufferLength(...) { return Status::OK(); } + +ARROW_EXPORT Status CheckBufferLength(const FixedSizeBinaryType* t, + const std::shared_ptr* b); + +}; // namespace internal + +template +struct MakeScalarImpl { + template < + typename T, typename ScalarType = typename TypeTraits::ScalarType, + typename ValueType = typename ScalarType::ValueType, + typename Enable = typename std::enable_if< + internal::is_simple_scalar::value && + std::is_same::type>::value>::type> + Status Visit(const T& t) { + ARROW_RETURN_NOT_OK(internal::CheckBufferLength(&t, &value_)); + *out_ = std::make_shared(ValueType(static_cast(value_)), type_, + true); + return Status::OK(); + } + + Status Visit(const DataType& t) { + return Status::NotImplemented("constructing scalars of type ", t, " from ", value_); + } + + const std::shared_ptr& type_; + ValueRef value_; + std::shared_ptr* out_; +}; + +template +Status MakeScalar(const std::shared_ptr& type, Value&& value, + std::shared_ptr* out) { + MakeScalarImpl impl = {type, std::forward(value), out}; + return VisitTypeInline(*type, &impl); +} + +/// \brief type inferring 
scalar factory +template ::type>, + typename ScalarType = typename Traits::ScalarType, + typename Enable = decltype(ScalarType(std::declval(), + Traits::type_singleton(), true))> +std::shared_ptr MakeScalar(Value value) { + return std::make_shared(std::move(value), Traits::type_singleton(), true); +} + +template +std::shared_ptr MakeScalar(const char (&value)[N]) { + return std::make_shared(value); +} + } // namespace arrow diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc index 6bd20556a19f..baf3aad1ae1f 100644 --- a/cpp/src/arrow/scalar_test.cc +++ b/cpp/src/arrow/scalar_test.cc @@ -15,23 +15,6 @@ // specific language governing permissions and limitations // under the License. -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- #include #include #include @@ -87,6 +70,21 @@ TYPED_TEST(TestNumericScalar, Basics) { ASSERT_FALSE(stack_val.is_valid); } +TYPED_TEST(TestNumericScalar, MakeScalar) { + using T = typename TypeParam::c_type; + using ScalarType = typename TypeTraits::ScalarType; + + std::shared_ptr three = MakeScalar(static_cast(3)); + ASSERT_TRUE(ScalarType(3).Equals(three)); + + ASSERT_OK( + MakeScalar(TypeTraits::type_singleton(), static_cast(3), &three)); + ASSERT_TRUE(ScalarType(3).Equals(three)); + + ASSERT_OK(Scalar::Parse(TypeTraits::type_singleton(), "3", &three)); + ASSERT_TRUE(ScalarType(3).Equals(three)); +} + TEST(TestBinaryScalar, Basics) { std::string data = "test data"; auto buf = std::make_shared(data); @@ -121,6 +119,17 @@ TEST(TestBinaryScalar, Basics) { ASSERT_FALSE(null_value2.is_valid); } +TEST(TestStringScalar, MakeScalar) { + auto three = MakeScalar("three"); + ASSERT_TRUE(StringScalar("three").Equals(three)); + + ASSERT_OK(MakeScalar(utf8(), Buffer::FromString("three"), &three)); + ASSERT_TRUE(StringScalar("three").Equals(three)); + + ASSERT_OK(Scalar::Parse(utf8(), "three", &three)); + ASSERT_TRUE(StringScalar("three").Equals(three)); +} + TEST(TestFixedSizeBinaryScalar, Basics) { std::string data = "test data"; auto buf = std::make_shared(data); @@ -133,6 +142,23 @@ TEST(TestFixedSizeBinaryScalar, Basics) { ASSERT_TRUE(value.type->Equals(*ex_type)); } +TEST(TestFixedSizeBinaryScalar, MakeScalar) { + std::string data = "test data"; + auto buf = std::make_shared(data); + auto type = fixed_size_binary(9); + + std::shared_ptr s; + ASSERT_OK(MakeScalar(type, buf, &s)); + ASSERT_TRUE(FixedSizeBinaryScalar(buf, type).Equals(s)); + + ASSERT_OK(Scalar::Parse(type, util::string_view(data), &s)); + ASSERT_TRUE(FixedSizeBinaryScalar(buf, type).Equals(s)); + + // wrong length: + ASSERT_RAISES(Invalid, MakeScalar(type, Buffer::FromString(data.substr(3)), &s)); + ASSERT_RAISES(Invalid, Scalar::Parse(type, util::string_view(data).substr(3), &s)); +} + 
TEST(TestDateScalars, Basics) { int32_t i32_val = 1; Date32Scalar date32_val(i32_val); @@ -151,6 +177,17 @@ TEST(TestDateScalars, Basics) { ASSERT_FALSE(date64_null.is_valid); } +TEST(TestDateScalars, MakeScalar) { + std::shared_ptr s; + ASSERT_OK(MakeScalar(date32(), int32_t(1), &s)); + ASSERT_TRUE(Date32Scalar(1).Equals(s)); + + ASSERT_OK(MakeScalar(date64(), int64_t(1), &s)); + ASSERT_TRUE(Date64Scalar(1).Equals(s)); + + ASSERT_RAISES(NotImplemented, Scalar::Parse(date64(), "", &s)); +} + TEST(TestTimeScalars, Basics) { auto type1 = time32(TimeUnit::MILLI); auto type2 = time32(TimeUnit::SECOND); @@ -176,6 +213,29 @@ TEST(TestTimeScalars, Basics) { ASSERT_TRUE(time64_null.type->Equals(*type4)); } +TEST(TestTimeScalars, MakeScalar) { + auto type1 = time32(TimeUnit::MILLI); + auto type2 = time32(TimeUnit::SECOND); + auto type3 = time64(TimeUnit::MICRO); + auto type4 = time64(TimeUnit::NANO); + + std::shared_ptr s; + + ASSERT_OK(MakeScalar(type1, int32_t(1), &s)); + ASSERT_TRUE(Time32Scalar(1, type1).Equals(s)); + + ASSERT_OK(MakeScalar(type2, int32_t(1), &s)); + ASSERT_TRUE(Time32Scalar(1, type2).Equals(s)); + + ASSERT_OK(MakeScalar(type3, int64_t(1), &s)); + ASSERT_TRUE(Time64Scalar(1, type3).Equals(s)); + + ASSERT_OK(MakeScalar(type4, int64_t(1), &s)); + ASSERT_TRUE(Time64Scalar(1, type4).Equals(s)); + + ASSERT_RAISES(NotImplemented, Scalar::Parse(type4, "", &s)); +} + TEST(TestTimestampScalars, Basics) { auto type1 = timestamp(TimeUnit::MILLI); auto type2 = timestamp(TimeUnit::SECOND); @@ -199,6 +259,37 @@ TEST(TestTimestampScalars, Basics) { ASSERT_FALSE(ts_val2.Equals(ts_null)); } +TEST(TestTimestampScalars, MakeScalar) { + auto type1 = timestamp(TimeUnit::MILLI); + auto type2 = timestamp(TimeUnit::SECOND); + auto type3 = timestamp(TimeUnit::MICRO); + auto type4 = timestamp(TimeUnit::NANO); + + std::shared_ptr s; + + util::string_view epoch_plus_1s = "1970-01-01 00:00:01"; + + ASSERT_OK(MakeScalar(type1, int64_t(1), &s)); + ASSERT_TRUE(TimestampScalar(1, 
type1).Equals(s)); + ASSERT_OK(Scalar::Parse(type1, epoch_plus_1s, &s)); + ASSERT_TRUE(TimestampScalar(1000, type1).Equals(s)); + + ASSERT_OK(MakeScalar(type2, int64_t(1), &s)); + ASSERT_TRUE(TimestampScalar(1, type2).Equals(s)); + ASSERT_OK(Scalar::Parse(type2, epoch_plus_1s, &s)); + ASSERT_TRUE(TimestampScalar(1, type2).Equals(s)); + + ASSERT_OK(MakeScalar(type3, int64_t(1), &s)); + ASSERT_TRUE(TimestampScalar(1, type3).Equals(s)); + ASSERT_OK(Scalar::Parse(type3, epoch_plus_1s, &s)); + ASSERT_TRUE(TimestampScalar(1000 * 1000, type3).Equals(s)); + + ASSERT_OK(MakeScalar(type4, int64_t(1), &s)); + ASSERT_TRUE(TimestampScalar(1, type4).Equals(s)); + ASSERT_OK(Scalar::Parse(type4, epoch_plus_1s, &s)); + ASSERT_TRUE(TimestampScalar(1000 * 1000 * 1000, type4).Equals(s)); +} + TEST(TestDurationScalars, Basics) { auto type1 = duration(TimeUnit::MILLI); auto type2 = duration(TimeUnit::SECOND); @@ -227,9 +318,9 @@ TEST(TestMonthIntervalScalars, Basics) { int32_t val1 = 1; int32_t val2 = 2; - MonthIntervalScalar ts_val1(val1, type); - MonthIntervalScalar ts_val2(val2, type); - MonthIntervalScalar ts_null(val2, type, false); + MonthIntervalScalar ts_val1(val1); + MonthIntervalScalar ts_val2(val2); + MonthIntervalScalar ts_null(val2, false); ASSERT_EQ(val1, ts_val1.value); ASSERT_EQ(val2, ts_null.value); @@ -249,9 +340,9 @@ TEST(TestDayTimeIntervalScalars, Basics) { DayTimeIntervalType::DayMilliseconds val1 = {1, 1}; DayTimeIntervalType::DayMilliseconds val2 = {2, 2}; - DayTimeIntervalScalar ts_val1(val1, type); - DayTimeIntervalScalar ts_val2(val2, type); - DayTimeIntervalScalar ts_null(val2, type, false); + DayTimeIntervalScalar ts_val1(val1); + DayTimeIntervalScalar ts_val2(val2); + DayTimeIntervalScalar ts_null(val2, false); ASSERT_EQ(val1, ts_val1.value); ASSERT_EQ(val2, ts_null.value); diff --git a/cpp/src/arrow/stl.h b/cpp/src/arrow/stl.h index 3238cf23db41..f063e91c20cf 100644 --- a/cpp/src/arrow/stl.h +++ b/cpp/src/arrow/stl.h @@ -42,12 +42,33 @@ namespace stl { 
namespace internal { +template +struct is_optional_like : public std::false_type {}; + +template +struct is_dereferencable : public std::false_type {}; + +template +struct is_dereferencable())>> + : public std::true_type {}; + +template +struct is_optional_like< + T, typename std::enable_if< + std::is_constructible::value && is_dereferencable::value && + !std::is_array::type>::value>::type> + : public std::true_type {}; + template -using BareTupleElement = typename std::remove_const::type>::type>::type; +using BareTupleElement = + typename std::decay::type>::type; } // namespace internal +template +using enable_if_optional_like = + typename std::enable_if::value, R>::type; + /// Traits meta class to map standard C/C++ types to equivalent Arrow types. template struct ConversionTraits {}; @@ -81,7 +102,6 @@ Status AppendListValues(CBuilderType& value_builder, Range&& cell_ra size_t j) { \ return array.Value(j); \ } \ - constexpr static bool nullable = false; \ }; \ \ template <> \ @@ -111,7 +131,6 @@ struct ConversionTraits : public CTypeTraits { static std::string GetEntry(const StringArray& array, size_t j) { return array.GetString(j); } - constexpr static bool nullable = false; }; /// Append cell range elements as a single value to the list builder. @@ -159,19 +178,15 @@ struct ConversionTraits> } return vec; } - - constexpr static bool nullable = false; }; template struct ConversionTraits> - : public CTypeTraits { - // Dependent names from base template class needs to be brought into scope. 
- using typename CTypeTraits::OptionalInnerType; - using typename CTypeTraits::ArrowType; - using CTypeTraits::type_singleton; - - constexpr static bool nullable = true; + : public CTypeTraits())>::type> { + using OptionalInnerType = + typename std::decay())>::type; + using typename CTypeTraits::ArrowType; + using CTypeTraits::type_singleton; static Status AppendRow(typename TypeTraits::BuilderType& builder, const Optional& cell) { @@ -200,8 +215,8 @@ struct SchemaFromTuple { const std::vector& names) { std::vector> ret = SchemaFromTuple::MakeSchemaRecursion(names); - std::shared_ptr type = CTypeTraits::type_singleton(); - ret.push_back(field(names[N - 1], type, ConversionTraits::nullable)); + auto type = ConversionTraits::type_singleton(); + ret.push_back(field(names[N - 1], type, internal::is_optional_like::value)); return ret; } @@ -232,7 +247,8 @@ struct SchemaFromTuple { std::vector> ret = SchemaFromTuple::MakeSchemaRecursionT(names); std::shared_ptr type = ConversionTraits::type_singleton(); - ret.push_back(field(get(names), type, ConversionTraits::nullable)); + ret.push_back( + field(get(names), type, internal::is_optional_like::value)); return ret; } diff --git a/cpp/src/arrow/stl_test.cc b/cpp/src/arrow/stl_test.cc index 1e93c7e97179..f259662271c9 100644 --- a/cpp/src/arrow/stl_test.cc +++ b/cpp/src/arrow/stl_test.cc @@ -123,8 +123,6 @@ namespace stl { template <> struct ConversionTraits : public CTypeTraits { - constexpr static bool nullable = true; - static Status AppendRow(typename TypeTraits::BuilderType& builder, const CustomOptionalTypeMock& cell) { if (cell) { @@ -137,9 +135,7 @@ struct ConversionTraits template <> struct ConversionTraits : public CTypeTraits { - constexpr static bool nullable = false; - - // AppendRow is not needed, since it shouldn't be called. 
+ // AppendRow is not needed, explicitly elide an implementation }; template <> diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index c2deddc3734f..527812726089 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -34,7 +34,7 @@ namespace arrow { template struct TypeTraits {}; -template +template struct CTypeTraits {}; template <> @@ -171,6 +171,7 @@ struct TypeTraits { return elements * static_cast(sizeof(DayTimeIntervalType::DayMilliseconds)); } constexpr static bool is_parameter_free = true; + static std::shared_ptr type_singleton() { return day_time_interval(); } }; template <> @@ -183,6 +184,7 @@ struct TypeTraits { return elements * static_cast(sizeof(int32_t)); } constexpr static bool is_parameter_free = true; + static std::shared_ptr type_singleton() { return month_interval(); } }; template <> @@ -283,9 +285,10 @@ struct CTypeTraits : public TypeTraits { }; template <> -struct CTypeTraits : public TypeTraits { - using ArrowType = StringType; -}; +struct CTypeTraits : public CTypeTraits {}; + +template +struct CTypeTraits : public CTypeTraits {}; template <> struct CTypeTraits @@ -377,39 +380,8 @@ struct make_void { template using void_t = typename make_void::type; -template -struct is_dereferencable : public std::false_type {}; - -template -struct is_dereferencable())>> - : public std::true_type {}; - -template -struct is_optional_like : public std::false_type {}; - -template -struct is_optional_like::value && - is_dereferencable::value>::type> - : public std::true_type {}; - } // namespace internal -template -using enable_if_optional_like = - typename std::enable_if::value, R>::type; - -template -struct CTypeTraits> { - using OptionalInnerType = - typename std::decay())>::type; - using ArrowType = typename CTypeTraits::ArrowType; - - static std::shared_ptr<::arrow::DataType> type_singleton() { - return CTypeTraits::type_singleton(); - } -}; - // // Useful type predicates // From 
b5ccbd2841668d8ae55c6fdf8d2a774fe42069e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 3 Oct 2019 12:46:09 -0500 Subject: [PATCH 181/201] ARROW-6771: [Packaging][Python] Missing pytest dependency from conda and wheel builds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #5569 from kszucs/ARROW-6771 and squashes the following commits: 43cceeae3 drop wheel-win-cp35m f61a1eaf2 use conda-forge 1baeb43c2 use conda_env_python in windows wheels ebd42dec9 properly skip is minio binary is missing 350aa4f9c conda test dependency Authored-by: Krisztián Szűcs Signed-off-by: Wes McKinney --- dev/tasks/conda-recipes/pyarrow/meta.yaml | 1 + dev/tasks/python-wheels/win-build.bat | 6 ++++-- dev/tasks/tasks.yml | 11 ----------- python/pyarrow/tests/conftest.py | 2 +- 4 files changed, 6 insertions(+), 14 deletions(-) diff --git a/dev/tasks/conda-recipes/pyarrow/meta.yaml b/dev/tasks/conda-recipes/pyarrow/meta.yaml index c970b1c4bfc9..51b6387433f8 100644 --- a/dev/tasks/conda-recipes/pyarrow/meta.yaml +++ b/dev/tasks/conda-recipes/pyarrow/meta.yaml @@ -52,6 +52,7 @@ test: requires: - pytest + - pytest-lazy-fixture - hypothesis commands: diff --git a/dev/tasks/python-wheels/win-build.bat b/dev/tasks/python-wheels/win-build.bat index dbb4e47f4f83..23410f6308d9 100644 --- a/dev/tasks/python-wheels/win-build.bat +++ b/dev/tasks/python-wheels/win-build.bat @@ -79,8 +79,10 @@ set ARROW_TEST_DATA=%ARROW_SRC%\testing\data @rem test the wheel @rem TODO For maximum reliability, we should test in a plain virtualenv instead. 
-call conda create -n wheel-test -q -y python=%PYTHON_VERSION% ^ - numpy=%NUMPY_VERSION% pandas cython pytest hypothesis || exit /B +call conda create -n wheel-test -c conda-forge -q -y ^ + --file %ARROW_SRC%\ci\conda_env_python.yml ^ + python=%PYTHON_VERSION% ^ + numpy=%NUMPY_VERSION% || exit /B call activate wheel-test @rem install the built wheel diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 3c19451d08b8..54efaa0d1868 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -46,7 +46,6 @@ groups: - wheel-osx-cp35m - wheel-osx-cp36m - wheel-osx-cp37m - - wheel-win-cp35m - wheel-win-cp36m - wheel-win-cp37m @@ -146,7 +145,6 @@ groups: - wheel-osx-cp35m - wheel-osx-cp36m - wheel-osx-cp37m - - wheel-win-cp35m - wheel-win-cp36m - wheel-win-cp37m - debian-stretch @@ -469,15 +467,6 @@ tasks: ############################## Wheel Windows ################################ - wheel-win-cp35m: - ci: appveyor - platform: win - template: python-wheels/appveyor.yml - params: - python_version: 3.5 - artifacts: - - pyarrow-{no_rc_version}-cp35-cp35m-win_amd64.whl - wheel-win-cp36m: ci: appveyor platform: win diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index 1b980080d50b..5f815475dc96 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -259,7 +259,7 @@ def minio_server(): proc = None try: proc = subprocess.Popen(args, env=env) - except IOError: + except (OSError, IOError): pytest.skip('`minio` command cannot be located') else: yield address, access_key, secret_key From cc05a89e84489b8a2c813727c54d23b1ffbdf8b0 Mon Sep 17 00:00:00 2001 From: Adam Krebs Date: Thu, 3 Oct 2019 13:57:45 -0500 Subject: [PATCH 182/201] ARROW-6785: [JS] Remove superfluous child assignment From `type.mjs`: ![Screenshot 2019-09-16 15 15 45](https://user-images.githubusercontent.com/931368/64986774-c77b8b00-d895-11e9-9ed7-7d101b283361.png) TypeScript's data modifiers [automatically create a `this.children = children` 
assignment](http://www.typescriptlang.org/play/#code/KYDwDg9gTgLgBAYwDYEMDOa4GUZQK4IwA8AKnKDMAHYAmmA3nANoDWwAngFxxq4CWVAOYBdbgBEUMFCXZhgcAL5wAvHBRV2APnIhKtTBKky5pWcAB0OfIQA0cEtvoAoOK8QQqvazGgAKMHgARkh8CHBQwCg0HkjsiAAWfEg0EVTcAGJ8wMmkrBwQAGb2wppMwgCUcM5uNTx4clC+5QDcLrWuMIlo5giJyakqCUkp1K01Cm2uAcGhcILA8DBmAJI0TVXhC3hQVPZmlrgE8Oh7cgfezYqTcNMhYT5WAoLrjBEw27sABlZHRPQAJPROnxur1hqlzABbFBgXy+AqVZTaT6AgrmKgoSHABScVHmJZyBSfcrmABWEAEvk+dmJCgUmk+lwmNTAUAglEIwBoPCkMFmTCw7EhgQgSHxEEeQhIKBEgzhrPZEG4P0IiMc1xqviI6i0NzZPhJYP61EGVDwSCQY3am3eOz1ioFQpFYoe-ClMuEgwA5CqYF6ra4FOVfL7zAqfATgC0nAogA). I suggest matching the other code and just removing the modifier instead of removing the assignment. Closes #5394 from akre54/patch-1 and squashes the following commits: ea97ebafe Update js/src/type.ts 32a126ee4 Remove superfluous child assignment Authored-by: Adam Krebs Signed-off-by: Wes McKinney --- js/src/type.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/js/src/type.ts b/js/src/type.ts index 602a83817cae..2f5e33ce1a1c 100644 --- a/js/src/type.ts +++ b/js/src/type.ts @@ -428,7 +428,8 @@ export class List extends DataType extends DataType { TArray: IterableArrayLike>; TValue: RowLike; dataTypes: T; } /** @ignore */ export class Struct extends DataType { - constructor(public readonly children: Field[]) { + public readonly children: Field[]; + constructor(children: Field[]) { super(); this.children = children; } From 21636fa0a53a05063001e56eee9941cc0858b630 Mon Sep 17 00:00:00 2001 From: Kyle McCarthy Date: Thu, 3 Oct 2019 15:45:14 -0400 Subject: [PATCH 183/201] ARROW-6744: [Rust] Publicly expose JsonEqual Jira Issue: https://issues.apache.org/jira/browse/ARROW-6744 As of right now, devs using arrow in their own project cannot implement the Array trait since it is bound by JsonEqual, which isn't exported publicly. I am not sure if this was done intentionally, but if not this PR will resolve the problem. 
Closes #5549 from kyle-mccarthy/master and squashes the following commits: fc2175008 ARROW-6744: Publicly expose JsonEqual Authored-by: Kyle McCarthy Signed-off-by: Paddy Horan --- rust/arrow/src/array/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/rust/arrow/src/array/mod.rs b/rust/arrow/src/array/mod.rs index 47e4219c8651..d7f129386aa0 100644 --- a/rust/arrow/src/array/mod.rs +++ b/rust/arrow/src/array/mod.rs @@ -163,3 +163,4 @@ pub type Time64NanosecondBuilder = PrimitiveBuilder; // --------------------- Array Equality --------------------- pub use self::equal::ArrayEqual; +pub use self::equal::JsonEqual; From d3ba8092c8edeae0d7c75dcd01096d6787316a1a Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 3 Oct 2019 14:43:58 -0600 Subject: [PATCH 184/201] ARROW-6091: [Rust] [DataFusion] Implement physical execution plan for LIMIT Closes #5509 from andygrove/ARROW-6091 and squashes the following commits: cb5c62298 bump nightly version 00078c782 minor optimization 0c916028f test passes 40e617f28 unit test 3cb4cab57 start roughing out LIMIT Authored-by: Andy Grove Signed-off-by: Andy Grove --- rust/datafusion/src/execution/limit.rs | 18 +- .../src/execution/physical_plan/limit.rs | 208 ++++++++++++++++++ .../src/execution/physical_plan/mod.rs | 1 + 3 files changed, 213 insertions(+), 14 deletions(-) create mode 100644 rust/datafusion/src/execution/physical_plan/limit.rs diff --git a/rust/datafusion/src/execution/limit.rs b/rust/datafusion/src/execution/limit.rs index 11c9a2db8f82..84b3ae8f76ca 100644 --- a/rust/datafusion/src/execution/limit.rs +++ b/rust/datafusion/src/execution/limit.rs @@ -21,12 +21,11 @@ use std::cell::RefCell; use std::rc::Rc; use std::sync::Arc; -use arrow::array::*; -use arrow::compute::array_ops::limit; use arrow::datatypes::Schema; use arrow::record_batch::RecordBatch; -use crate::error::{ExecutionError, Result}; +use crate::error::Result; +use crate::execution::physical_plan::limit::truncate_batch; use 
crate::execution::relation::Relation; /// Implementation of a LIMIT relation @@ -67,18 +66,9 @@ impl Relation for LimitRelation { return Ok(None); } - if batch.num_rows() >= capacity { - let limited_columns: Result> = (0..batch.num_columns()) - .map(|i| match limit(batch.column(i), capacity) { - Ok(result) => Ok(result), - Err(error) => Err(ExecutionError::from(error)), - }) - .collect(); - - let limited_batch: RecordBatch = - RecordBatch::try_new(self.schema.clone(), limited_columns?)?; + if batch.num_rows() > capacity { + let limited_batch = truncate_batch(&batch, capacity)?; self.num_consumed_rows += capacity; - Ok(Some(limited_batch)) } else { self.num_consumed_rows += batch.num_rows(); diff --git a/rust/datafusion/src/execution/physical_plan/limit.rs b/rust/datafusion/src/execution/physical_plan/limit.rs new file mode 100644 index 000000000000..87e77f97743b --- /dev/null +++ b/rust/datafusion/src/execution/physical_plan/limit.rs @@ -0,0 +1,208 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Defines the LIMIT plan + +use crate::error::{ExecutionError, Result}; +use crate::execution::physical_plan::common::RecordBatchIterator; +use crate::execution::physical_plan::ExecutionPlan; +use crate::execution::physical_plan::{BatchIterator, Partition}; +use arrow::array::ArrayRef; +use arrow::compute::array_ops::limit; +use arrow::datatypes::Schema; +use arrow::record_batch::RecordBatch; +use std::sync::{Arc, Mutex}; +use std::thread; +use std::thread::JoinHandle; + +/// Limit execution plan +pub struct LimitExec { + /// Input schema + schema: Arc, + /// Input partitions + partitions: Vec>, + /// Maximum number of rows to return + limit: usize, +} + +impl LimitExec { + /// Create a new MergeExec + pub fn new( + schema: Arc, + partitions: Vec>, + limit: usize, + ) -> Self { + LimitExec { + schema, + partitions, + limit, + } + } +} + +impl ExecutionPlan for LimitExec { + fn schema(&self) -> Arc { + self.schema.clone() + } + + fn partitions(&self) -> Result>> { + Ok(vec![Arc::new(LimitPartition { + schema: self.schema.clone(), + partitions: self.partitions.clone(), + limit: self.limit, + })]) + } +} + +struct LimitPartition { + /// Input schema + schema: Arc, + /// Input partitions + partitions: Vec>, + /// Maximum number of rows to return + limit: usize, +} + +impl Partition for LimitPartition { + fn execute(&self) -> Result>> { + // collect up to "limit" rows on each partition + let threads: Vec>>> = self + .partitions + .iter() + .map(|p| { + let p = p.clone(); + let limit = self.limit; + thread::spawn(move || { + let it = p.execute()?; + collect_with_limit(it, limit) + }) + }) + .collect(); + + // combine the results from each thread, up to the limit + let mut combined_results: Vec> = vec![]; + let mut count = 0; + for thread in threads { + let join = thread.join().expect("Failed to join thread"); + let result = join?; + for batch in result { + let capacity = self.limit - count; + if batch.num_rows() <= capacity { + count += batch.num_rows(); + 
combined_results.push(Arc::new(batch.clone())) + } else { + let batch = truncate_batch(&batch, capacity)?; + count += batch.num_rows(); + combined_results.push(Arc::new(batch.clone())) + } + if count == self.limit { + break; + } + } + } + + Ok(Arc::new(Mutex::new(RecordBatchIterator::new( + self.schema.clone(), + combined_results, + )))) + } +} + +/// Truncate a RecordBatch to maximum of n rows +pub fn truncate_batch(batch: &RecordBatch, n: usize) -> Result { + let limited_columns: Result> = (0..batch.num_columns()) + .map(|i| match limit(batch.column(i), n) { + Ok(result) => Ok(result), + Err(error) => Err(ExecutionError::from(error)), + }) + .collect(); + + Ok(RecordBatch::try_new( + batch.schema().clone(), + limited_columns?, + )?) +} + +/// Create a vector of record batches from an iterator +fn collect_with_limit( + it: Arc>, + limit: usize, +) -> Result> { + let mut count = 0; + let mut it = it.lock().unwrap(); + let mut results: Vec = vec![]; + loop { + match it.next() { + Ok(Some(batch)) => { + let capacity = limit - count; + if batch.num_rows() <= capacity { + count += batch.num_rows(); + results.push(batch); + } else { + let batch = truncate_batch(&batch, capacity)?; + count += batch.num_rows(); + results.push(batch); + } + if count == limit { + return Ok(results); + } + } + Ok(None) => { + // end of result set + return Ok(results); + } + Err(e) => return Err(e), + } + } +} + +#[cfg(test)] +mod tests { + + use super::*; + use crate::execution::physical_plan::common; + use crate::execution::physical_plan::csv::CsvExec; + use crate::test; + + #[test] + fn limit() -> Result<()> { + let schema = test::aggr_test_schema(); + + let num_partitions = 4; + let path = + test::create_partitioned_csv("aggregate_test_100.csv", num_partitions)?; + + let csv = CsvExec::try_new(&path, schema.clone(), true, None, 1024)?; + + // input should have 4 partitions + let input = csv.partitions()?; + assert_eq!(input.len(), num_partitions); + + let limit = 
LimitExec::new(schema.clone(), input, 7); + let partitions = limit.partitions()?; + + // the result should contain 4 batches (one per input partition) + let iter = partitions[0].execute()?; + let batches = common::collect(iter)?; + + // there should be a total of 100 rows + let row_count: usize = batches.iter().map(|batch| batch.num_rows()).sum(); + assert_eq!(row_count, 7); + + Ok(()) + } +} diff --git a/rust/datafusion/src/execution/physical_plan/mod.rs b/rust/datafusion/src/execution/physical_plan/mod.rs index f0c34c228db0..b820af86b269 100644 --- a/rust/datafusion/src/execution/physical_plan/mod.rs +++ b/rust/datafusion/src/execution/physical_plan/mod.rs @@ -86,6 +86,7 @@ pub mod csv; pub mod datasource; pub mod expressions; pub mod hash_aggregate; +pub mod limit; pub mod merge; pub mod projection; pub mod selection; From e0efdbd28c0f1804ce6047a29dfbc5ca8cc5b02e Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 3 Oct 2019 15:58:04 -0500 Subject: [PATCH 185/201] ARROW-6634: [C++] Vendor Flatbuffers and check in compiled sources This patch removes Flatbuffers as a build pre-requisite (or an ExternalProject that must be downloaded and built) I added convenience scripts to build-support to update the compiled sources and to re-vendor Flatbuffers when we want to upgrade versions in the future. 
Closes #5464 from wesm/cpp-vendor-flatbuffers and squashes the following commits: e0ae41814 Code review comments, rebase 464590f16 Vendor Flatbuffers, remove flatbuffers_ep Authored-by: Wes McKinney Signed-off-by: Wes McKinney --- ci/PKGBUILD | 3 - ci/appveyor-cpp-setup-mingw.bat | 1 - ci/conda_env_cpp.yml | 1 - cpp/CMakeLists.txt | 3 + cpp/build-support/lint_cpp_cli.py | 3 +- cpp/build-support/update-flatbuffers.sh | 39 + cpp/build-support/vendor-flatbuffers.sh | 31 + cpp/cmake_modules/FindFlatbuffersAlt.cmake | 56 - cpp/cmake_modules/ThirdpartyToolchain.cmake | 114 +- cpp/src/arrow/CMakeLists.txt | 1 - cpp/src/arrow/flight/CMakeLists.txt | 2 - cpp/src/arrow/gpu/CMakeLists.txt | 2 - cpp/src/arrow/gpu/cuda_arrow_ipc.cc | 3 +- cpp/src/arrow/ipc/CMakeLists.txt | 39 - cpp/src/arrow/ipc/feather.cc | 5 +- cpp/src/arrow/ipc/feather_internal.h | 5 +- cpp/src/arrow/ipc/feather_test.cc | 3 +- cpp/src/arrow/ipc/message.cc | 3 +- cpp/src/arrow/ipc/metadata_internal.cc | 9 +- cpp/src/arrow/ipc/metadata_internal.h | 5 +- cpp/src/arrow/ipc/read_write_test.cc | 3 +- cpp/src/arrow/ipc/reader.cc | 7 +- cpp/src/arrow/util/ubsan.h | 3 +- cpp/src/generated/File_generated.h | 181 + cpp/src/generated/Message_generated.h | 508 +++ cpp/src/generated/Schema_generated.h | 2045 ++++++++++ cpp/src/generated/SparseTensor_generated.h | 644 ++++ cpp/src/generated/Tensor_generated.h | 377 ++ cpp/src/generated/feather_generated.h | 839 +++++ cpp/src/plasma/CMakeLists.txt | 32 - cpp/src/plasma/{format => }/common.fbs | 0 cpp/src/plasma/common_generated.h | 227 ++ cpp/src/plasma/format/.gitignore | 1 - cpp/src/plasma/{format => }/plasma.fbs | 0 cpp/src/plasma/plasma_generated.h | 3331 +++++++++++++++++ cpp/src/plasma/protocol.cc | 2 +- .../flatbuffers/include/flatbuffers/base.h | 379 ++ .../include/flatbuffers/flatbuffers.h | 2613 +++++++++++++ .../include/flatbuffers/stl_emulation.h | 275 ++ cpp/thirdparty/versions.txt | 2 - dev/container/Dockerfile | 2 +- dev/docker_common/Dockerfile.xenial.base | 
1 - dev/release/rat_exclude_files.txt | 3 + dev/tasks/conda-recipes/arrow-cpp/meta.yaml | 1 - docs/source/developers/cpp.rst | 4 +- 45 files changed, 11531 insertions(+), 277 deletions(-) create mode 100755 cpp/build-support/update-flatbuffers.sh create mode 100755 cpp/build-support/vendor-flatbuffers.sh delete mode 100644 cpp/cmake_modules/FindFlatbuffersAlt.cmake create mode 100644 cpp/src/generated/File_generated.h create mode 100644 cpp/src/generated/Message_generated.h create mode 100644 cpp/src/generated/Schema_generated.h create mode 100644 cpp/src/generated/SparseTensor_generated.h create mode 100644 cpp/src/generated/Tensor_generated.h create mode 100644 cpp/src/generated/feather_generated.h rename cpp/src/plasma/{format => }/common.fbs (100%) create mode 100644 cpp/src/plasma/common_generated.h delete mode 100644 cpp/src/plasma/format/.gitignore rename cpp/src/plasma/{format => }/plasma.fbs (100%) create mode 100644 cpp/src/plasma/plasma_generated.h create mode 100644 cpp/thirdparty/flatbuffers/include/flatbuffers/base.h create mode 100644 cpp/thirdparty/flatbuffers/include/flatbuffers/flatbuffers.h create mode 100644 cpp/thirdparty/flatbuffers/include/flatbuffers/stl_emulation.h diff --git a/ci/PKGBUILD b/ci/PKGBUILD index 76b3a1588dd4..37b24b478668 100644 --- a/ci/PKGBUILD +++ b/ci/PKGBUILD @@ -93,9 +93,6 @@ build() { -Ddouble-conversion_ROOT="${MINGW_PREFIX}" \ -DThrift_ROOT="${MINGW_PREFIX}" - # Workaround for a compile error that appears with GCC 4.9.3 - sed -i 's/-fPIC/ /g' flatbuffers_ep-prefix/src/flatbuffers_ep-stamp/flatbuffers_ep-configure-RELEASE.cmake - make popd } diff --git a/ci/appveyor-cpp-setup-mingw.bat b/ci/appveyor-cpp-setup-mingw.bat index b58f8ee61e65..ca3639858daf 100644 --- a/ci/appveyor-cpp-setup-mingw.bat +++ b/ci/appveyor-cpp-setup-mingw.bat @@ -40,7 +40,6 @@ pacman --sync --noconfirm ^ %MINGW_PACKAGE_PREFIX%-brotli ^ %MINGW_PACKAGE_PREFIX%-cmake ^ %MINGW_PACKAGE_PREFIX%-double-conversion ^ - %MINGW_PACKAGE_PREFIX%-flatbuffers 
^ %MINGW_PACKAGE_PREFIX%-gflags ^ %MINGW_PACKAGE_PREFIX%-gobject-introspection ^ %MINGW_PACKAGE_PREFIX%-grpc ^ diff --git a/ci/conda_env_cpp.yml b/ci/conda_env_cpp.yml index 2aff2cb72ffb..46507bf0e7d7 100644 --- a/ci/conda_env_cpp.yml +++ b/ci/conda_env_cpp.yml @@ -23,7 +23,6 @@ bzip2 c-ares cmake double-conversion -flatbuffers gflags glog gmock>=1.8.1 diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 640d1843521c..1193b2e1dbfb 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -448,6 +448,9 @@ message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") include_directories(${CMAKE_CURRENT_BINARY_DIR}/src) include_directories(src) +# Compiled flatbuffers files +include_directories(src/generated) + # # Visibility # diff --git a/cpp/build-support/lint_cpp_cli.py b/cpp/build-support/lint_cpp_cli.py index f6f5d506ff30..f725742c34d7 100644 --- a/cpp/build-support/lint_cpp_cli.py +++ b/cpp/build-support/lint_cpp_cli.py @@ -87,7 +87,8 @@ def lint_file(path): gandiva/jni jni/ test - internal''') + internal + _generated''') def lint_files(): diff --git a/cpp/build-support/update-flatbuffers.sh b/cpp/build-support/update-flatbuffers.sh new file mode 100755 index 000000000000..b7acdd8979be --- /dev/null +++ b/cpp/build-support/update-flatbuffers.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +# + +# Run this from cpp/ directory. flatc is expected to be in your path + +CWD="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" +SOURCE_DIR=$CWD/../src +FORMAT_DIR=$CWD/../.. + +flatc -c -o $SOURCE_DIR/generated \ + $FORMAT_DIR/Message.fbs \ + $FORMAT_DIR/File.fbs \ + $FORMAT_DIR/Schema.fbs \ + $FORMAT_DIR/Tensor.fbs \ + $FORMAT_DIR/SparseTensor.fbs \ + src/arrow/ipc/feather.fbs + +flatc -c -o $SOURCE_DIR/plasma \ + --gen-object-api \ + --scoped-enums \ + $SOURCE_DIR/plasma/common.fbs \ + $SOURCE_DIR/plasma/plasma.fbs diff --git a/cpp/build-support/vendor-flatbuffers.sh b/cpp/build-support/vendor-flatbuffers.sh new file mode 100755 index 000000000000..94a99e826829 --- /dev/null +++ b/cpp/build-support/vendor-flatbuffers.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +set -eu + +# Run this from cpp/ directory with $FLATBUFFERS_HOME set to location of your +# Flatbuffers installation +SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" + +VENDOR_LOCATION=$SOURCE_DIR/../thirdparty/flatbuffers/include/flatbuffers +mkdir -p $VENDOR_LOCATION +cp -f $FLATBUFFERS_HOME/include/flatbuffers/base.h $VENDOR_LOCATION +cp -f $FLATBUFFERS_HOME/include/flatbuffers/flatbuffers.h $VENDOR_LOCATION +cp -f $FLATBUFFERS_HOME/include/flatbuffers/stl_emulation.h $VENDOR_LOCATION diff --git a/cpp/cmake_modules/FindFlatbuffersAlt.cmake b/cpp/cmake_modules/FindFlatbuffersAlt.cmake deleted file mode 100644 index c551c1c55825..000000000000 --- a/cpp/cmake_modules/FindFlatbuffersAlt.cmake +++ /dev/null @@ -1,56 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -if(Flatbuffers_ROOT) - find_library(FLATBUFFERS_LIB - NAMES flatbuffers - PATHS ${Flatbuffers_ROOT} - NO_DEFAULT_PATH - PATH_SUFFIXES ${LIB_PATH_SUFFIXES}) - find_path(FLATBUFFERS_INCLUDE_DIR - NAMES flatbuffers/flatbuffers.h - PATHS ${Flatbuffers_ROOT} - NO_DEFAULT_PATH - PATH_SUFFIXES ${INCLUDE_PATH_SUFFIXES}) - find_program(FLATC - NAMES flatc - PATHS ${Flatbuffers_ROOT} - NO_DEFAULT_PATH - PATH_SUFFIXES "bin") -else() - find_library(FLATBUFFERS_LIB NAMES flatbuffers PATH_SUFFIXES ${LIB_PATH_SUFFIXES}) - find_path(FLATBUFFERS_INCLUDE_DIR - NAMES flatbuffers/flatbuffers.h - PATH_SUFFIXES ${INCLUDE_PATH_SUFFIXES}) - find_program(FLATC NAMES flatc PATHS PATH_SUFFIXES "bin") -endif() - -find_package_handle_standard_args(FlatbuffersAlt - REQUIRED_VARS - FLATBUFFERS_LIB - FLATBUFFERS_INCLUDE_DIR - FLATC) - -if(FlatbuffersAlt_FOUND) - add_library(flatbuffers::flatbuffers UNKNOWN IMPORTED) - set_target_properties(flatbuffers::flatbuffers - PROPERTIES IMPORTED_LOCATION "${FLATBUFFERS_LIB}" - INTERFACE_INCLUDE_DIRECTORIES - "${FLATBUFFERS_INCLUDE_DIR}") - add_executable(flatbuffers::flatc IMPORTED) - set_target_properties(flatbuffers::flatc PROPERTIES IMPORTED_LOCATION "${FLATC}") -endif() diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index ace517124b65..44bc639f82c0 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -61,7 +61,6 @@ set(ARROW_THIRDPARTY_DEPENDENCIES BZip2 c-ares double-conversion - Flatbuffers gflags GLOG gRPC @@ -141,8 +140,6 @@ macro(build_dependency DEPENDENCY_NAME) build_bzip2() elseif("${DEPENDENCY_NAME}" STREQUAL "c-ares") build_cares() - elseif("${DEPENDENCY_NAME}" STREQUAL "Flatbuffers") - build_flatbuffers() elseif("${DEPENDENCY_NAME}" STREQUAL "gflags") build_gflags() elseif("${DEPENDENCY_NAME}" STREQUAL "GLOG") @@ -203,6 +200,9 @@ endmacro() set(THIRDPARTY_DIR "${arrow_SOURCE_DIR}/thirdparty") +# Include vendored Flatbuffers 
+include_directories(SYSTEM "${THIRDPARTY_DIR}/flatbuffers/include") + # ---------------------------------------------------------------------- # Some EP's require other EP's @@ -221,10 +221,6 @@ if(ARROW_FLIGHT) set(ARROW_WITH_URIPARSER ON) endif() -if(ARROW_FLIGHT OR ARROW_IPC) - set(ARROW_WITH_FLATBUFFERS ON) -endif() - if(ARROW_JSON) set(ARROW_WITH_RAPIDJSON ON) endif() @@ -300,13 +296,6 @@ else() ) endif() -if(DEFINED ENV{ARROW_FLATBUFFERS_URL}) - set(FLATBUFFERS_SOURCE_URL "$ENV{ARROW_FLATBUFFERS_URL}") -else() - set(FLATBUFFERS_SOURCE_URL - "https://github.com/google/flatbuffers/archive/${FLATBUFFERS_VERSION}.tar.gz") -endif() - if(DEFINED ENV{ARROW_GBENCHMARK_URL}) set(GBENCHMARK_SOURCE_URL "$ENV{ARROW_GBENCHMARK_URL}") else() @@ -1783,103 +1772,6 @@ if(ARROW_WITH_RAPIDJSON) include_directories(SYSTEM ${RAPIDJSON_INCLUDE_DIR}) endif() -macro(build_flatbuffers) - message(STATUS "Building flatbuffers from source") - set(FLATBUFFERS_PREFIX - "${CMAKE_CURRENT_BINARY_DIR}/flatbuffers_ep-prefix/src/flatbuffers_ep-install") - if(MSVC) - set(FLATBUFFERS_CMAKE_CXX_FLAGS /EHsc) - else() - set(FLATBUFFERS_CMAKE_CXX_FLAGS -fPIC) - endif() - set(FLATBUFFERS_COMPILER "${FLATBUFFERS_PREFIX}/bin/flatc") - set( - FLATBUFFERS_STATIC_LIB - "${FLATBUFFERS_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}flatbuffers${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) - # We always need to do release builds, otherwise flatc will not be installed. 
- externalproject_add(flatbuffers_ep - URL ${FLATBUFFERS_SOURCE_URL} - BUILD_BYPRODUCTS ${FLATBUFFERS_COMPILER} ${FLATBUFFERS_STATIC_LIB} - CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} - "-DCMAKE_BUILD_TYPE=RELEASE" - "-DCMAKE_CXX_FLAGS=${FLATBUFFERS_CMAKE_CXX_FLAGS}" - "-DCMAKE_INSTALL_PREFIX:PATH=${FLATBUFFERS_PREFIX}" - "-DFLATBUFFERS_BUILD_TESTS=OFF" - ${EP_LOG_OPTIONS}) - - file(MAKE_DIRECTORY "${FLATBUFFERS_PREFIX}/include") - - add_library(flatbuffers::flatbuffers STATIC IMPORTED) - set_target_properties(flatbuffers::flatbuffers - PROPERTIES IMPORTED_LOCATION "${FLATBUFFERS_STATIC_LIB}" - INTERFACE_INCLUDE_DIRECTORIES - "${FLATBUFFERS_PREFIX}/include") - add_executable(flatbuffers::flatc IMPORTED) - set_target_properties(flatbuffers::flatc - PROPERTIES IMPORTED_LOCATION "${FLATBUFFERS_COMPILER}") - - add_dependencies(toolchain flatbuffers_ep) - add_dependencies(flatbuffers::flatbuffers flatbuffers_ep) - add_dependencies(flatbuffers::flatc flatbuffers_ep) -endmacro() - -if(ARROW_WITH_FLATBUFFERS) - if(Flatbuffers_SOURCE STREQUAL "AUTO") - find_package(Flatbuffers QUIET) - # Older versions of Flatbuffers (that are not built using CMake) - # don't install a FlatbuffersConfig.cmake - # This is only supported from 1.10+ on, we support at least 1.7 - if(NOT Flatbuffers_FOUND) - find_package(FlatbuffersAlt) - endif() - if(NOT Flatbuffers_FOUND AND NOT FlatbuffersAlt_FOUND) - build_flatbuffers() - endif() - elseif(Flatbuffers_SOURCE STREQUAL "BUNDLED") - build_flatbuffers() - elseif(Flatbuffers_SOURCE STREQUAL "SYSTEM") - find_package(Flatbuffers QUIET) - if(NOT Flatbuffers_FOUND) - find_package(FlatbuffersAlt REQUIRED) - endif() - endif() - - if(TARGET flatbuffers::flatbuffers_shared AND NOT TARGET flatbuffers::flatbuffers) - get_target_property(FLATBUFFERS_INCLUDE_DIR flatbuffers::flatbuffers_shared - INTERFACE_INCLUDE_DIRECTORIES) - get_target_property(FLATBUFFERS_SHARED_LIB flatbuffers::flatbuffers_shared - IMPORTED_LOCATION) - add_library(flatbuffers::flatbuffers SHARED 
IMPORTED) - set_target_properties(flatbuffers::flatbuffers - PROPERTIES IMPORTED_LOCATION "${FLATBUFFERS_SHARED_LIB}" - INTERFACE_INCLUDE_DIRECTORIES - "${FLATBUFFERS_INCLUDE_DIR}") - endif() - - if(TARGET flatbuffers::flatc) - get_target_property(FLATBUFFERS_COMPILER_LOCATION_CONFIG flatbuffers::flatc - IMPORTED_LOCATION_${UPPERCASE_BUILD_TYPE}) - get_target_property(FLATBUFFERS_COMPILER_LOCATION flatbuffers::flatc - IMPORTED_LOCATION) - get_target_property(FLATBUFFERS_COMPILER_LOCATION_NOCONFIG flatbuffers::flatc - IMPORTED_LOCATION_NOCONFIG) - # mingw-w64-flatbuffers provides location only for "noconfig" - if(NOT FLATBUFFERS_COMPILER_LOCATION_CONFIG - AND NOT FLATBUFFERS_COMPILER_LOCATION - AND FLATBUFFERS_COMPILER_LOCATION_NOCONFIG) - set_target_properties(flatbuffers::flatc - PROPERTIES IMPORTED_LOCATION - "${FLATBUFFERS_COMPILER_LOCATION_NOCONFIG}") - endif() - endif() - - # TODO: Don't use global includes but rather target_include_directories - get_target_property(FLATBUFFERS_INCLUDE_DIR flatbuffers::flatbuffers - INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${FLATBUFFERS_INCLUDE_DIR}) -endif() - macro(build_zlib) message(STATUS "Building ZLIB from source") set(ZLIB_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/zlib_ep/src/zlib_ep-install") diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 8aaf5f549fdc..721affc59cf7 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -267,7 +267,6 @@ if(ARROW_IPC) list( APPEND ARROW_SRCS ipc/json_integration.cc ipc/json_internal.cc ipc/json_simple.cc) endif() - add_dependencies(arrow_dependencies metadata_fbs) endif() if(ARROW_JSON) diff --git a/cpp/src/arrow/flight/CMakeLists.txt b/cpp/src/arrow/flight/CMakeLists.txt index bc40ec24f5b3..2cd2084b638a 100644 --- a/cpp/src/arrow/flight/CMakeLists.txt +++ b/cpp/src/arrow/flight/CMakeLists.txt @@ -94,7 +94,6 @@ add_arrow_lib(arrow_flight ${ARROW_FLIGHT_SRCS} DEPENDENCIES flight_grpc_gen - metadata_fbs 
SHARED_LINK_FLAGS ${ARROW_VERSION_SCRIPT_FLAGS} # Defined in cpp/arrow/CMakeLists.txt SHARED_LINK_LIBS @@ -119,7 +118,6 @@ if(ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS OR ARROW_BUILD_INTEGRATION) GTest::GTest flight_grpc_gen arrow_dependencies - metadata_fbs SHARED_LINK_LIBS arrow_shared arrow_flight_shared diff --git a/cpp/src/arrow/gpu/CMakeLists.txt b/cpp/src/arrow/gpu/CMakeLists.txt index 98edc59ef4c1..57ca43aa804e 100644 --- a/cpp/src/arrow/gpu/CMakeLists.txt +++ b/cpp/src/arrow/gpu/CMakeLists.txt @@ -43,8 +43,6 @@ add_arrow_lib(arrow_cuda ${ARROW_CUDA_SRCS} OUTPUTS ARROW_CUDA_LIBRARIES - DEPENDENCIES - metadata_fbs SHARED_LINK_FLAGS ${ARROW_VERSION_SCRIPT_FLAGS} # Defined in cpp/arrow/CMakeLists.txt SHARED_LINK_LIBS diff --git a/cpp/src/arrow/gpu/cuda_arrow_ipc.cc b/cpp/src/arrow/gpu/cuda_arrow_ipc.cc index 0fb81bcc5a43..d0679dd71e13 100644 --- a/cpp/src/arrow/gpu/cuda_arrow_ipc.cc +++ b/cpp/src/arrow/gpu/cuda_arrow_ipc.cc @@ -23,7 +23,6 @@ #include #include "arrow/buffer.h" -#include "arrow/ipc/Message_generated.h" #include "arrow/ipc/dictionary.h" #include "arrow/ipc/message.h" #include "arrow/ipc/reader.h" @@ -32,6 +31,8 @@ #include "arrow/status.h" #include "arrow/util/visibility.h" +#include "generated/Message_generated.h" + #include "arrow/gpu/cuda_context.h" #include "arrow/gpu/cuda_memory.h" diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt index bf31df5ac6de..9450e4bfdd8c 100644 --- a/cpp/src/arrow/ipc/CMakeLists.txt +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -38,45 +38,6 @@ elseif(ARROW_BUILD_INTEGRATION) add_dependencies(arrow-integration arrow-json-integration-test) endif() -# make clean will delete the generated file -set_source_files_properties(Message_generated.h PROPERTIES GENERATED TRUE) -set_source_files_properties(feather_generated.h PROPERTIES GENERATED TRUE) -set_source_files_properties(File_generated.h PROPERTIES GENERATED TRUE) - -set(OUTPUT_DIR ${ARROW_BINARY_DIR}/src/arrow/ipc) -set(FBS_OUTPUT_FILES 
"${OUTPUT_DIR}/File_generated.h" "${OUTPUT_DIR}/Message_generated.h" - "${OUTPUT_DIR}/feather_generated.h") - -set(FBS_SRC - ${ARROW_SOURCE_DIR}/../format/Message.fbs - ${ARROW_SOURCE_DIR}/../format/File.fbs - ${ARROW_SOURCE_DIR}/../format/Schema.fbs - ${ARROW_SOURCE_DIR}/../format/Tensor.fbs - ${ARROW_SOURCE_DIR}/../format/SparseTensor.fbs - ${CMAKE_CURRENT_SOURCE_DIR}/feather.fbs) - -foreach(FIL ${FBS_SRC}) - get_filename_component(ABS_FIL ${FIL} ABSOLUTE) - list(APPEND ABS_FBS_SRC ${ABS_FIL}) -endforeach() - -get_target_property(FLATC_EXECUTABLE flatbuffers::flatc IMPORTED_LOCATION) -if(NOT FLATC_EXECUTABLE) - get_target_property(FLATC_EXECUTABLE flatbuffers::flatc IMPORTED_LOCATION_RELEASE) -endif() -message(STATUS "flatc: ${FLATC_EXECUTABLE}") -add_custom_command(OUTPUT ${FBS_OUTPUT_FILES} - COMMAND ${FLATC_EXECUTABLE} - -c - -o - ${OUTPUT_DIR} - ${ABS_FBS_SRC} - DEPENDS flatbuffers::flatc ${ABS_FBS_SRC} - COMMENT "Running flatc compiler on ${ABS_FBS_SRC}" - VERBATIM) - -add_custom_target(metadata_fbs DEPENDS ${FBS_OUTPUT_FILES}) - # Headers: top level arrow_install_all_headers("arrow/ipc") diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc index 2fc29ebf91c3..cbd8b2d318b8 100644 --- a/cpp/src/arrow/ipc/feather.cc +++ b/cpp/src/arrow/ipc/feather.cc @@ -25,12 +25,11 @@ #include #include -#include "flatbuffers/flatbuffers.h" +#include #include "arrow/array.h" #include "arrow/buffer.h" #include "arrow/io/interfaces.h" -#include "arrow/ipc/feather_generated.h" #include "arrow/ipc/feather_internal.h" #include "arrow/ipc/util.h" // IWYU pragma: keep #include "arrow/status.h" @@ -42,6 +41,8 @@ #include "arrow/util/logging.h" #include "arrow/visitor.h" +#include "generated/feather_generated.h" + namespace arrow { using internal::checked_cast; diff --git a/cpp/src/arrow/ipc/feather_internal.h b/cpp/src/arrow/ipc/feather_internal.h index 2aa04b2db72b..53f84d7a99fe 100644 --- a/cpp/src/arrow/ipc/feather_internal.h +++ 
b/cpp/src/arrow/ipc/feather_internal.h @@ -27,13 +27,14 @@ #include #include -#include "flatbuffers/flatbuffers.h" +#include #include "arrow/buffer.h" #include "arrow/ipc/feather.h" -#include "arrow/ipc/feather_generated.h" #include "arrow/type.h" +#include "generated/feather_generated.h" + namespace arrow { namespace ipc { namespace feather { diff --git a/cpp/src/arrow/ipc/feather_test.cc b/cpp/src/arrow/ipc/feather_test.cc index fb7ccb568687..d96b2dca6aec 100644 --- a/cpp/src/arrow/ipc/feather_test.cc +++ b/cpp/src/arrow/ipc/feather_test.cc @@ -23,7 +23,6 @@ #include "arrow/array.h" #include "arrow/io/memory.h" -#include "arrow/ipc/feather_generated.h" #include "arrow/ipc/feather_internal.h" #include "arrow/ipc/test_common.h" #include "arrow/memory_pool.h" @@ -35,6 +34,8 @@ #include "arrow/type.h" #include "arrow/util/checked_cast.h" +#include "generated/feather_generated.h" + namespace arrow { class Buffer; diff --git a/cpp/src/arrow/ipc/message.cc b/cpp/src/arrow/ipc/message.cc index 87066320d4ed..e490b29df5d7 100644 --- a/cpp/src/arrow/ipc/message.cc +++ b/cpp/src/arrow/ipc/message.cc @@ -27,13 +27,14 @@ #include "arrow/buffer.h" #include "arrow/io/interfaces.h" -#include "arrow/ipc/Message_generated.h" #include "arrow/ipc/metadata_internal.h" #include "arrow/ipc/util.h" #include "arrow/status.h" #include "arrow/util/logging.h" #include "arrow/util/ubsan.h" +#include "generated/Message_generated.h" + namespace arrow { namespace ipc { diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index d4ed8b7e0da0..a37b49c7c7a1 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -28,10 +28,6 @@ #include "arrow/array.h" #include "arrow/extension_type.h" #include "arrow/io/interfaces.h" -#include "arrow/ipc/File_generated.h" // IWYU pragma: keep -#include "arrow/ipc/Message_generated.h" -#include "arrow/ipc/SparseTensor_generated.h" // IWYU pragma: keep -#include 
"arrow/ipc/Tensor_generated.h" // IWYU pragma: keep #include "arrow/ipc/message.h" #include "arrow/ipc/util.h" #include "arrow/sparse_tensor.h" @@ -44,6 +40,11 @@ #include "arrow/util/ubsan.h" #include "arrow/visitor_inline.h" +#include "generated/File_generated.h" // IWYU pragma: keep +#include "generated/Message_generated.h" +#include "generated/SparseTensor_generated.h" // IWYU pragma: keep +#include "generated/Tensor_generated.h" // IWYU pragma: keep + namespace arrow { namespace flatbuf = org::apache::arrow::flatbuf; diff --git a/cpp/src/arrow/ipc/metadata_internal.h b/cpp/src/arrow/ipc/metadata_internal.h index 803d67df46c4..82678bd4ee1a 100644 --- a/cpp/src/arrow/ipc/metadata_internal.h +++ b/cpp/src/arrow/ipc/metadata_internal.h @@ -29,14 +29,15 @@ #include #include "arrow/buffer.h" -#include "arrow/ipc/Message_generated.h" -#include "arrow/ipc/Schema_generated.h" #include "arrow/ipc/dictionary.h" // IYWU pragma: keep #include "arrow/ipc/message.h" #include "arrow/memory_pool.h" #include "arrow/sparse_tensor.h" #include "arrow/status.h" +#include "generated/Message_generated.h" +#include "generated/Schema_generated.h" + namespace arrow { class DataType; diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index 318527a01829..ba9e4ac5aaa6 100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -31,7 +31,6 @@ #include "arrow/io/file.h" #include "arrow/io/memory.h" #include "arrow/io/test_common.h" -#include "arrow/ipc/Message_generated.h" // IWYU pragma: keep #include "arrow/ipc/message.h" #include "arrow/ipc/metadata_internal.h" #include "arrow/ipc/reader.h" @@ -50,6 +49,8 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/key_value_metadata.h" +#include "generated/Message_generated.h" // IWYU pragma: keep + namespace arrow { using internal::checked_cast; diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 4fc74d18efb6..a459f5bf4053 100644 --- 
a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -31,9 +31,6 @@ #include "arrow/buffer.h" #include "arrow/io/interfaces.h" #include "arrow/io/memory.h" -#include "arrow/ipc/File_generated.h" // IWYU pragma: export -#include "arrow/ipc/Message_generated.h" -#include "arrow/ipc/Schema_generated.h" #include "arrow/ipc/dictionary.h" #include "arrow/ipc/message.h" #include "arrow/ipc/metadata_internal.h" @@ -46,6 +43,10 @@ #include "arrow/util/logging.h" #include "arrow/visitor_inline.h" +#include "generated/File_generated.h" // IWYU pragma: export +#include "generated/Message_generated.h" +#include "generated/Schema_generated.h" + using arrow::internal::checked_pointer_cast; namespace arrow { diff --git a/cpp/src/arrow/util/ubsan.h b/cpp/src/arrow/util/ubsan.h index fe5322a3e830..9f5a42595bc4 100644 --- a/cpp/src/arrow/util/ubsan.h +++ b/cpp/src/arrow/util/ubsan.h @@ -40,7 +40,8 @@ static uint8_t non_null_filler; /// method to wrap locations where this could happen. /// /// Note: Flatbuffers has UBSan warnings if a zero length vector is passed. -/// https://github.com/google/flatbuffers/pull/5355 is trying to resolve them. +/// https://github.com/google/flatbuffers/pull/5355 is trying to resolve +/// them. 
template inline T* MakeNonNull(T* maybe_null) { if (ARROW_PREDICT_TRUE(maybe_null != NULLPTR)) { diff --git a/cpp/src/generated/File_generated.h b/cpp/src/generated/File_generated.h new file mode 100644 index 000000000000..37e2f9c7ea4a --- /dev/null +++ b/cpp/src/generated/File_generated.h @@ -0,0 +1,181 @@ +// automatically generated by the FlatBuffers compiler, do not modify + + +#ifndef FLATBUFFERS_GENERATED_FILE_ORG_APACHE_ARROW_FLATBUF_H_ +#define FLATBUFFERS_GENERATED_FILE_ORG_APACHE_ARROW_FLATBUF_H_ + +#include "flatbuffers/flatbuffers.h" + +#include "Schema_generated.h" + +namespace org { +namespace apache { +namespace arrow { +namespace flatbuf { + +struct Footer; + +struct Block; + +FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(8) Block FLATBUFFERS_FINAL_CLASS { + private: + int64_t offset_; + int32_t metaDataLength_; + int32_t padding0__; + int64_t bodyLength_; + + public: + Block() { + memset(static_cast(this), 0, sizeof(Block)); + } + Block(int64_t _offset, int32_t _metaDataLength, int64_t _bodyLength) + : offset_(flatbuffers::EndianScalar(_offset)), + metaDataLength_(flatbuffers::EndianScalar(_metaDataLength)), + padding0__(0), + bodyLength_(flatbuffers::EndianScalar(_bodyLength)) { + (void)padding0__; + } + /// Index to the start of the RecordBlock (note this is past the Message header) + int64_t offset() const { + return flatbuffers::EndianScalar(offset_); + } + /// Length of the metadata + int32_t metaDataLength() const { + return flatbuffers::EndianScalar(metaDataLength_); + } + /// Length of the data (this is aligned so there can be a gap between this and + /// the metatdata). 
+ int64_t bodyLength() const { + return flatbuffers::EndianScalar(bodyLength_); + } +}; +FLATBUFFERS_STRUCT_END(Block, 24); + +/// ---------------------------------------------------------------------- +/// Arrow File metadata +/// +struct Footer FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_VERSION = 4, + VT_SCHEMA = 6, + VT_DICTIONARIES = 8, + VT_RECORDBATCHES = 10 + }; + MetadataVersion version() const { + return static_cast(GetField(VT_VERSION, 0)); + } + const Schema *schema() const { + return GetPointer(VT_SCHEMA); + } + const flatbuffers::Vector *dictionaries() const { + return GetPointer *>(VT_DICTIONARIES); + } + const flatbuffers::Vector *recordBatches() const { + return GetPointer *>(VT_RECORDBATCHES); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_VERSION) && + VerifyOffset(verifier, VT_SCHEMA) && + verifier.VerifyTable(schema()) && + VerifyOffset(verifier, VT_DICTIONARIES) && + verifier.VerifyVector(dictionaries()) && + VerifyOffset(verifier, VT_RECORDBATCHES) && + verifier.VerifyVector(recordBatches()) && + verifier.EndTable(); + } +}; + +struct FooterBuilder { + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_version(MetadataVersion version) { + fbb_.AddElement(Footer::VT_VERSION, static_cast(version), 0); + } + void add_schema(flatbuffers::Offset schema) { + fbb_.AddOffset(Footer::VT_SCHEMA, schema); + } + void add_dictionaries(flatbuffers::Offset> dictionaries) { + fbb_.AddOffset(Footer::VT_DICTIONARIES, dictionaries); + } + void add_recordBatches(flatbuffers::Offset> recordBatches) { + fbb_.AddOffset(Footer::VT_RECORDBATCHES, recordBatches); + } + explicit FooterBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + FooterBuilder &operator=(const FooterBuilder &); + flatbuffers::Offset