Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 57 additions & 37 deletions cpp/src/parquet/metadata.cc
Original file line number Diff line number Diff line change
Expand Up @@ -90,40 +90,64 @@ std::string ParquetVersionToString(ParquetVersion::type ver) {
return "UNKNOWN";
}

namespace {

StatisticsMinMaxField GetStatisticsMinMaxField(const ColumnDescriptor& descr) {
switch (descr.column_order().get_order()) {
case ColumnOrder::TYPE_DEFINED_ORDER:
return descr.sort_order() != SortOrder::UNKNOWN
? StatisticsMinMaxField::kMinValueMaxValue
: StatisticsMinMaxField::kInvalid;
case ColumnOrder::UNDEFINED:
return descr.sort_order() == SortOrder::SIGNED
? StatisticsMinMaxField::kLegacyMinMax
: StatisticsMinMaxField::kInvalid;
case ColumnOrder::UNKNOWN:
return StatisticsMinMaxField::kInvalid;
}
return StatisticsMinMaxField::kInvalid;
}

template <typename DType>
static std::shared_ptr<Statistics> MakeTypedColumnStats(
const format::ColumnMetaData& metadata, const ColumnDescriptor* descr,
::arrow::MemoryPool* pool) {
std::optional<bool> min_exact =
metadata.statistics.__isset.is_min_value_exact
? std::optional<bool>(metadata.statistics.is_min_value_exact)
: std::nullopt;
std::optional<bool> max_exact =
metadata.statistics.__isset.is_max_value_exact
? std::optional<bool>(metadata.statistics.is_max_value_exact)
: std::nullopt;
// If ColumnOrder is defined, return max_value and min_value
if (descr->column_order().get_order() == ColumnOrder::TYPE_DEFINED_ORDER) {
return MakeStatistics<DType>(
descr, metadata.statistics.min_value, metadata.statistics.max_value,
metadata.num_values - metadata.statistics.null_count,
metadata.statistics.null_count, metadata.statistics.distinct_count,
metadata.statistics.__isset.max_value && metadata.statistics.__isset.min_value,
metadata.statistics.__isset.null_count,
metadata.statistics.__isset.distinct_count, min_exact, max_exact, pool);
}
// Default behavior
std::shared_ptr<Statistics> MakeTypedColumnStats(const format::ColumnMetaData& metadata,
const ColumnDescriptor* descr,
::arrow::MemoryPool* pool) {
const auto& statistics = metadata.statistics;
const std::string kEmpty = "";
const std::string* encoded_min = &kEmpty;
const std::string* encoded_max = &kEmpty;
bool has_min_max = false;
std::optional<bool> min_exact = std::nullopt;
std::optional<bool> max_exact = std::nullopt;

switch (GetStatisticsMinMaxField(*descr)) {
case StatisticsMinMaxField::kMinValueMaxValue:
encoded_min = &statistics.min_value;
encoded_max = &statistics.max_value;
has_min_max = statistics.__isset.max_value && statistics.__isset.min_value;
min_exact = statistics.__isset.is_min_value_exact
? std::optional<bool>(statistics.is_min_value_exact)
: std::nullopt;
max_exact = statistics.__isset.is_max_value_exact
? std::optional<bool>(statistics.is_max_value_exact)
: std::nullopt;
break;
case StatisticsMinMaxField::kLegacyMinMax:
encoded_min = &statistics.min;
encoded_max = &statistics.max;
has_min_max = statistics.__isset.max && statistics.__isset.min;
break;
case StatisticsMinMaxField::kInvalid:
break;
}

return MakeStatistics<DType>(
descr, metadata.statistics.min, metadata.statistics.max,
metadata.num_values - metadata.statistics.null_count,
metadata.statistics.null_count, metadata.statistics.distinct_count,
metadata.statistics.__isset.max && metadata.statistics.__isset.min,
metadata.statistics.__isset.null_count, metadata.statistics.__isset.distinct_count,
min_exact, max_exact, pool);
descr, *encoded_min, *encoded_max, metadata.num_values - statistics.null_count,
statistics.null_count, statistics.distinct_count, has_min_max,
statistics.__isset.null_count, statistics.__isset.distinct_count, min_exact,
max_exact, pool);
}

namespace {

std::shared_ptr<geospatial::GeoStatistics> MakeColumnGeometryStats(
const format::ColumnMetaData& metadata, const ColumnDescriptor* descr) {
if (metadata.__isset.geospatial_statistics) {
Expand Down Expand Up @@ -336,12 +360,8 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
{
const std::lock_guard<std::mutex> guard(stats_mutex_);
if (possible_encoded_stats_ == nullptr) {
possible_encoded_stats_ =
std::make_shared<EncodedStatistics>(FromThrift(column_metadata_->statistics));
if (descr_->sort_order() == SortOrder::UNKNOWN) {
// If the column SortOrder is Unknown we can't trust max/min.
possible_encoded_stats_->ClearMinMax();
}
possible_encoded_stats_ = std::make_shared<EncodedStatistics>(
FromThrift(column_metadata_->statistics, GetStatisticsMinMaxField(*descr_)));
}
Comment thread
wgtmac marked this conversation as resolved.
}
return writer_version_->HasCorrectStatistics(type(), *possible_encoded_stats_,
Expand Down Expand Up @@ -1037,7 +1057,7 @@ class FileMetaData::FileMetaDataImpl {
if (column_order.__isset.TYPE_ORDER) {
column_orders.push_back(ColumnOrder::type_defined_);
} else {
column_orders.push_back(ColumnOrder::undefined_);
column_orders.push_back(ColumnOrder::unknown_);
}
}
} else {
Expand Down
98 changes: 98 additions & 0 deletions cpp/src/parquet/metadata_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,104 @@ TEST(Metadata, TestBuildAccess) {
ASSERT_TRUE(f_accessor_1->Equals(*f_accessor->Subset({2, 0})));
}

namespace {

std::string EncodeInt32(int32_t value) {
return std::string(reinterpret_cast<const char*>(&value), sizeof(value));
}

constexpr int32_t kLegacyMin = 100, kLegacyMax = 200;

std::string SerializeMetadata(const format::FileMetaData& thrift_metadata) {
std::string out;
ThriftSerializer{}.SerializeToString(&thrift_metadata, &out);
return out;
}

std::shared_ptr<FileMetaData> ParseMetadata(std::string serialized_metadata) {
uint32_t decoded_len = static_cast<uint32_t>(serialized_metadata.size());
return FileMetaData::Make(serialized_metadata.data(), &decoded_len);
}

format::FileMetaData SingleInt32MetadataWithStats() {
format::FileMetaData metadata;
format::SchemaElement root, leaf;
schema::NodeVector fields = {schema::Int32("int_col", Repetition::REQUIRED)};
schema::GroupNode::Make("schema", Repetition::REPEATED, fields)->ToParquet(&root);
fields.back()->ToParquet(&leaf);
metadata.schema = {std::move(root), std::move(leaf)};

auto& column = metadata.row_groups.emplace_back().columns.emplace_back();
column.__isset.meta_data = true;
auto& column_metadata = column.meta_data;
column_metadata.__set_type(format::Type::INT32);
column_metadata.__isset.statistics = true;
auto& statistics = column_metadata.statistics;
statistics.__set_min(EncodeInt32(kLegacyMin));
statistics.__set_max(EncodeInt32(kLegacyMax));
statistics.__set_min_value(EncodeInt32(kLegacyMin));
statistics.__set_max_value(EncodeInt32(kLegacyMax));
metadata.column_orders.emplace_back().__set_TYPE_ORDER(format::TypeDefinedOrder{});
metadata.__isset.column_orders = true;
return metadata;
}

std::unique_ptr<ColumnChunkMetaData> GetOnlyColumnChunk(
const FileMetaData& metadata, ColumnOrder::type expected_order) {
EXPECT_EQ(expected_order, metadata.schema()->Column(0)->column_order().get_order());
return metadata.RowGroup(0)->ColumnChunk(0);
}

void AssertColumnChunkHasNoMinMax(const FileMetaData& metadata,
ColumnOrder::type expected_order) {
auto column = GetOnlyColumnChunk(metadata, expected_order);
ASSERT_NE(nullptr, column->encoded_statistics());
EXPECT_FALSE(column->encoded_statistics()->has_min);
EXPECT_FALSE(column->encoded_statistics()->has_max);
ASSERT_NE(nullptr, column->statistics());
EXPECT_FALSE(column->statistics()->HasMinMax());
}

void AssertColumnChunkMinMax(const FileMetaData& metadata,
ColumnOrder::type expected_order, int32_t min, int32_t max) {
auto column = GetOnlyColumnChunk(metadata, expected_order);
const std::string encoded_min = EncodeInt32(min);
const std::string encoded_max = EncodeInt32(max);

ASSERT_NE(nullptr, column->encoded_statistics());
EXPECT_EQ(encoded_min, column->encoded_statistics()->min());
EXPECT_EQ(encoded_max, column->encoded_statistics()->max());
ASSERT_NE(nullptr, column->statistics());
EXPECT_EQ(encoded_min, column->statistics()->EncodeMin());
EXPECT_EQ(encoded_max, column->statistics()->EncodeMax());
}

} // namespace

TEST(Metadata, UnknownColumnOrderIgnoresMinMax) {
format::FileMetaData thrift_metadata = SingleInt32MetadataWithStats();
// Simulate an unsupported ColumnOrder value: unknown union fields are skipped by
// Thrift, leaving an entry with no known field set.
thrift_metadata.column_orders.clear();
thrift_metadata.column_orders.emplace_back();
thrift_metadata.__isset.column_orders = true;

auto metadata = ParseMetadata(SerializeMetadata(thrift_metadata));
AssertColumnChunkHasNoMinMax(*metadata, ColumnOrder::UNKNOWN);
}
Comment thread
wgtmac marked this conversation as resolved.

TEST(Metadata, MissingColumnOrderUsesLegacyMinMax) {
format::FileMetaData thrift_metadata = SingleInt32MetadataWithStats();
thrift_metadata.column_orders.clear();
thrift_metadata.__isset.column_orders = false;
auto& statistics = thrift_metadata.row_groups.at(0).columns.at(0).meta_data.statistics;
statistics.__set_min_value(EncodeInt32(kLegacyMin - 100));
statistics.__set_max_value(EncodeInt32(kLegacyMax + 100));

auto metadata = ParseMetadata(SerializeMetadata(thrift_metadata));
AssertColumnChunkMinMax(*metadata, ColumnOrder::UNDEFINED, kLegacyMin, kLegacyMax);
}

TEST(Metadata, TestV1Version) {
// PARQUET-839
parquet::schema::NodeVector fields;
Expand Down
13 changes: 13 additions & 0 deletions cpp/src/parquet/schema.h
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,19 @@ class PARQUET_EXPORT ColumnDescriptor {
return la ? GetSortOrder(la, pt) : GetSortOrder(converted_type(), pt);
}

// Whether ColumnOrder-governed min/max values have a supported ordering.
bool can_use_stats() const {

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would this be better called can_use_min_max? some stats like null_count could still be used, right?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a good idea.

switch (column_order().get_order()) {
case ColumnOrder::TYPE_DEFINED_ORDER:
return sort_order() != SortOrder::UNKNOWN;
case ColumnOrder::UNDEFINED:
return sort_order() == SortOrder::SIGNED;
case ColumnOrder::UNKNOWN:
return false;
}
return false;
Comment thread
wgtmac marked this conversation as resolved.
}
Comment thread
wgtmac marked this conversation as resolved.

const std::string& name() const { return primitive_node_->name(); }

const std::shared_ptr<schema::ColumnPath> path() const;
Expand Down
26 changes: 26 additions & 0 deletions cpp/src/parquet/schema_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -660,6 +660,32 @@ TEST(TestColumnDescriptor, TestAttrs) {
ASSERT_EQ(expected_descr, descr2.ToString());
}

TEST(TestColumnDescriptor, CanUseStats) {
NodePtr node = Int32("name");
ColumnDescriptor descr(node, 0, 0);
// Type-defined column order is usable when the type has a known sort order.
EXPECT_TRUE(descr.can_use_stats());

auto primitive_node = std::static_pointer_cast<PrimitiveNode>(node);
primitive_node->SetColumnOrder(ColumnOrder::undefined_);
// Missing column order falls back to legacy min/max, which are signed.
EXPECT_TRUE(ColumnDescriptor(node, 0, 0).can_use_stats());

primitive_node->SetColumnOrder(ColumnOrder::unknown_);
// Unsupported column order means min/max ordering is unknown to this reader.
EXPECT_FALSE(ColumnDescriptor(node, 0, 0).can_use_stats());

node = PrimitiveNode::Make("name", Repetition::REQUIRED, Type::BYTE_ARRAY);
primitive_node = std::static_pointer_cast<PrimitiveNode>(node);
primitive_node->SetColumnOrder(ColumnOrder::undefined_);
// Legacy min/max are signed, so they cannot represent unsigned byte ordering.
EXPECT_FALSE(ColumnDescriptor(node, 0, 0).can_use_stats());

node = PrimitiveNode::Make("name", Repetition::REQUIRED, Type::INT96);
// INT96 has no defined sort order in the Parquet type-defined ordering.
EXPECT_FALSE(ColumnDescriptor(node, 0, 0).can_use_stats());
}
Comment thread
wgtmac marked this conversation as resolved.

class TestSchemaDescriptor : public ::testing::Test {
public:
void setUp() {}
Expand Down
7 changes: 7 additions & 0 deletions cpp/src/parquet/statistics_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1660,6 +1660,13 @@ TEST(TestStatisticsSortOrder, UNKNOWN) {
ASSERT_EQ(1, enc_stats->null_count);
ASSERT_FALSE(enc_stats->is_max_value_exact.has_value());
ASSERT_FALSE(enc_stats->is_min_value_exact.has_value());

// Unknown sort order should not cause min/max to be set
std::shared_ptr<Statistics> stats = column_chunk->statistics();
ASSERT_NE(nullptr, stats);
ASSERT_FALSE(stats->HasMinMax());
ASSERT_TRUE(stats->HasNullCount());
ASSERT_EQ(1, stats->null_count());
}

// Test statistics for binary column with UNSIGNED sort order
Expand Down
29 changes: 22 additions & 7 deletions cpp/src/parquet/thrift_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -252,12 +252,21 @@ static inline AadMetadata FromThrift(format::AesGcmCtrV1 aesGcmCtrV1) {
aesGcmCtrV1.supply_aad_prefix};
}

static inline EncodedStatistics FromThrift(const format::Statistics& stats) {
// Selects which thrift Statistics min/max fields should populate EncodedStatistics.
enum class StatisticsMinMaxField {
// Do not populate min/max, because the ordering is undefined or unsupported.
kInvalid,
// Populate min/max from the min_value/max_value fields.
kMinValueMaxValue,
// Populate min/max from the legacy min/max fields.
kLegacyMinMax,
};

static inline EncodedStatistics FromThrift(const format::Statistics& stats,
StatisticsMinMaxField min_max) {
EncodedStatistics out;

// Use the new V2 min-max statistics over the former one if it is filled
if (stats.__isset.max_value || stats.__isset.min_value) {
// TODO: check if the column_order is TYPE_DEFINED_ORDER.
if (min_max == StatisticsMinMaxField::kMinValueMaxValue) {
if (stats.__isset.max_value) {
out.set_max(stats.max_value);
if (stats.__isset.is_max_value_exact) {
Expand All @@ -270,9 +279,7 @@ static inline EncodedStatistics FromThrift(const format::Statistics& stats) {
out.is_min_value_exact = stats.is_min_value_exact;
}
}
} else if (stats.__isset.max || stats.__isset.min) {
// TODO: check created_by to see if it is corrupted for some types.
// TODO: check if the sort_order is SIGNED.
} else if (min_max == StatisticsMinMaxField::kLegacyMinMax) {
if (stats.__isset.max) {
out.set_max(stats.max);
}
Expand All @@ -290,6 +297,14 @@ static inline EncodedStatistics FromThrift(const format::Statistics& stats) {
return out;
}

static inline EncodedStatistics FromThrift(const format::Statistics& stats) {
// Use the new V2 min-max statistics over the former one if it is filled.
if (stats.__isset.max_value || stats.__isset.min_value) {
return FromThrift(stats, StatisticsMinMaxField::kMinValueMaxValue);
}
return FromThrift(stats, StatisticsMinMaxField::kLegacyMinMax);
}

static inline geospatial::EncodedGeoStatistics FromThrift(
const format::GeospatialStatistics& geo_stats) {
geospatial::EncodedGeoStatistics out;
Expand Down
1 change: 1 addition & 0 deletions cpp/src/parquet/types.cc
Original file line number Diff line number Diff line change
Expand Up @@ -444,6 +444,7 @@ SortOrder::type GetSortOrder(const std::shared_ptr<const LogicalType>& logical_t

ColumnOrder ColumnOrder::undefined_ = ColumnOrder(ColumnOrder::UNDEFINED);
ColumnOrder ColumnOrder::type_defined_ = ColumnOrder(ColumnOrder::TYPE_DEFINED_ORDER);
ColumnOrder ColumnOrder::unknown_ = ColumnOrder(ColumnOrder::UNKNOWN);

// Static methods for LogicalType class

Expand Down
10 changes: 9 additions & 1 deletion cpp/src/parquet/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -599,14 +599,22 @@ bool PageCanUseChecksum(PageType::type pageType);

class ColumnOrder {
public:
enum type { UNDEFINED, TYPE_DEFINED_ORDER };
enum type {
// File metadata has no column order, only legacy min/max in stats are defined.
UNDEFINED,
// File metadata uses TypeDefinedOrder from the Parquet format.
TYPE_DEFINED_ORDER,
// Column order value unsupported by this reader.
UNKNOWN
};
explicit ColumnOrder(ColumnOrder::type column_order) : column_order_(column_order) {}
// Default to Type Defined Order
ColumnOrder() : column_order_(type::TYPE_DEFINED_ORDER) {}
ColumnOrder::type get_order() { return column_order_; }

static ColumnOrder undefined_;
static ColumnOrder type_defined_;
static ColumnOrder unknown_;

private:
ColumnOrder::type column_order_;
Expand Down
Loading