diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index 1cb263c44..00905378a 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -36,7 +36,9 @@ namespace iceberg { Schema::Schema(std::vector fields, int32_t schema_id) - : StructType(std::move(fields)), schema_id_(schema_id) {} + : StructType(std::move(fields)), + schema_id_(schema_id), + cache_(std::make_unique(this)) {} Result> Schema::Make(std::vector fields, int32_t schema_id, @@ -156,14 +158,14 @@ bool Schema::Equals(const Schema& other) const { Result>> Schema::FindFieldByName( std::string_view name, bool case_sensitive) const { if (case_sensitive) { - ICEBERG_ASSIGN_OR_RAISE(auto name_id_map, name_id_map_.Get(*this)); + ICEBERG_ASSIGN_OR_RAISE(auto name_id_map, cache_->GetNameIdMap()); auto it = name_id_map.get().name_to_id.find(name); if (it == name_id_map.get().name_to_id.end()) { return std::nullopt; }; return FindFieldById(it->second); } - ICEBERG_ASSIGN_OR_RAISE(auto lowercase_name_to_id, lowercase_name_to_id_.Get(*this)); + ICEBERG_ASSIGN_OR_RAISE(auto lowercase_name_to_id, cache_->GetLowercaseNameToIdMap()); auto it = lowercase_name_to_id.get().find(StringUtils::ToLower(name)); if (it == lowercase_name_to_id.get().end()) { return std::nullopt; @@ -171,39 +173,9 @@ Result>> Schema::FindFie return FindFieldById(it->second); } -Result>> -Schema::InitIdToFieldMap(const Schema& self) { - std::unordered_map> id_to_field; - IdToFieldVisitor visitor(id_to_field); - ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(self, &visitor)); - return id_to_field; -} - -Result Schema::InitNameIdMap(const Schema& self) { - NameIdMap name_id_map; - NameToIdVisitor visitor(name_id_map.name_to_id, &name_id_map.id_to_name, - /*case_sensitive=*/true); - ICEBERG_RETURN_UNEXPECTED( - VisitTypeInline(self, &visitor, /*path=*/"", /*short_path=*/"")); - visitor.Finish(); - return name_id_map; -} - -Result>> -Schema::InitLowerCaseNameToIdMap(const Schema& self) { - std::unordered_map> - lowercase_name_to_id; - NameToIdVisitor visitor(lowercase_name_to_id, /*id_to_name=*/nullptr, - /*case_sensitive=*/false); - ICEBERG_RETURN_UNEXPECTED( - VisitTypeInline(self, &visitor, /*path=*/"", /*short_path=*/"")); - visitor.Finish(); - return lowercase_name_to_id; -} - Result>> Schema::FindFieldById( int32_t field_id) const { - ICEBERG_ASSIGN_OR_RAISE(auto id_to_field, id_to_field_.Get(*this)); + ICEBERG_ASSIGN_OR_RAISE(auto id_to_field, cache_->GetIdToFieldMap()); auto it = id_to_field.get().find(field_id); if (it == id_to_field.get().end()) { return std::nullopt; @@ -213,7 +185,7 @@ Result>> Schema::FindFie Result> Schema::FindColumnNameById( int32_t field_id) const { - ICEBERG_ASSIGN_OR_RAISE(auto name_id_map, name_id_map_.Get(*this)); + ICEBERG_ASSIGN_OR_RAISE(auto name_id_map, cache_->GetNameIdMap()); auto it = name_id_map.get().id_to_name.find(field_id); if (it == name_id_map.get().id_to_name.end()) { return std::nullopt; @@ -221,30 +193,9 @@ Result> Schema::FindColumnNameById( return it->second; } -Result>> Schema::InitIdToPositionPath( - const Schema& self) { - PositionPathVisitor visitor; - ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(self, &visitor)); - return visitor.Finish(); -} - -Result Schema::InitHighestFieldId(const Schema& self) { - ICEBERG_ASSIGN_OR_RAISE(auto id_to_field, self.id_to_field_.Get(self)); - - if (id_to_field.get().empty()) { - return kInitialColumnId; - } - - auto max_it = std::ranges::max_element( - id_to_field.get(), - [](const auto& lhs, const auto& rhs) { return lhs.first < rhs.first; }); - - return max_it->first; -} - Result> Schema::GetAccessorById( int32_t field_id) const { - ICEBERG_ASSIGN_OR_RAISE(auto id_to_position_path, id_to_position_path_.Get(*this)); + ICEBERG_ASSIGN_OR_RAISE(auto id_to_position_path, cache_->GetIdToPositionPathMap()); if (auto it = id_to_position_path.get().find(field_id); it != id_to_position_path.get().cend()) { ICEBERG_ASSIGN_OR_RAISE(auto field, FindFieldById(field_id)); @@ -322,7 +273,7 @@ Result> Schema::IdentifierFieldNames() const { return names; } -Result Schema::HighestFieldId() const { return highest_field_id_.Get(*this); } +Result Schema::HighestFieldId() const { return cache_->GetHighestFieldId(); } bool Schema::SameSchema(const Schema& other) const { return fields_ == other.fields_ && identifier_field_ids_ == other.identifier_field_ids_; @@ -330,7 +281,7 @@ bool Schema::SameSchema(const Schema& other) const { Status Schema::Validate(int32_t format_version) const { // Get all fields including nested ones - ICEBERG_ASSIGN_OR_RAISE(auto id_to_field, id_to_field_.Get(*this)); + ICEBERG_ASSIGN_OR_RAISE(auto id_to_field, cache_->GetIdToFieldMap()); // Check each field's type and defaults for (const auto& [field_id, field_ref] : id_to_field.get()) { @@ -351,4 +302,75 @@ Status Schema::Validate(int32_t format_version) const { return {}; } +Result SchemaCache::GetIdToFieldMap() const { + return id_to_field_.Get(schema_); +} + +Result SchemaCache::GetNameIdMap() const { + return name_id_map_.Get(schema_); +} + +Result SchemaCache::GetLowercaseNameToIdMap() + const { + return lowercase_name_to_id_.Get(schema_); +} + +Result SchemaCache::GetIdToPositionPathMap() const { + return id_to_position_path_.Get(schema_); +} + +Result SchemaCache::GetHighestFieldId() const { + return highest_field_id_.Get(schema_); +} + +Result SchemaCache::InitIdToFieldMap(const Schema* schema) { + std::unordered_map> id_to_field; + IdToFieldVisitor visitor(id_to_field); + ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*schema, &visitor)); + return id_to_field; +} + +Result SchemaCache::InitNameIdMap(const Schema* schema) { + NameIdMap name_id_map; + NameToIdVisitor visitor(name_id_map.name_to_id, &name_id_map.id_to_name, + /*case_sensitive=*/true); + ICEBERG_RETURN_UNEXPECTED( + VisitTypeInline(*schema, &visitor, /*path=*/"", /*short_path=*/"")); + visitor.Finish(); + return name_id_map; +} + +Result SchemaCache::InitLowerCaseNameToIdMap( + const Schema* schema) { + std::unordered_map> + lowercase_name_to_id; + NameToIdVisitor visitor(lowercase_name_to_id, /*id_to_name=*/nullptr, + /*case_sensitive=*/false); + ICEBERG_RETURN_UNEXPECTED( + VisitTypeInline(*schema, &visitor, /*path=*/"", /*short_path=*/"")); + visitor.Finish(); + return lowercase_name_to_id; +} + +Result SchemaCache::InitIdToPositionPath( + const Schema* schema) { + PositionPathVisitor visitor; + ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*schema, &visitor)); + return visitor.Finish(); +} + +Result SchemaCache::InitHighestFieldId(const Schema* schema) { + ICEBERG_ASSIGN_OR_RAISE(auto id_to_field, InitIdToFieldMap(schema)); + + if (id_to_field.empty()) { + return Schema::kInitialColumnId; + } + + auto max_it = std::ranges::max_element( + id_to_field, + [](const auto& lhs, const auto& rhs) { return lhs.first < rhs.first; }); + + return max_it->first; +} + } // namespace iceberg diff --git a/src/iceberg/schema.h b/src/iceberg/schema.h index 1445a7934..bcaccaa15 100644 --- a/src/iceberg/schema.h +++ b/src/iceberg/schema.h @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -38,6 +39,8 @@ namespace iceberg { +class SchemaCache; + /// \brief A schema for a Table. /// /// A schema is a list of typed columns, along with a unique integer ID. A @@ -187,6 +190,22 @@ class ICEBERG_EXPORT Schema : public StructType { /// \brief Compare two schemas for equality. bool Equals(const Schema& other) const; + const int32_t schema_id_; + // Field IDs that uniquely identify rows in the table. + std::vector identifier_field_ids_; + // Cache for schema mappings to facilitate fast lookups. + std::unique_ptr cache_; +}; + +// Cache for schema mappings to facilitate fast lookups. +class ICEBERG_EXPORT SchemaCache { + public: + explicit SchemaCache(const Schema* schema) : schema_(schema) {} + + using IdToFieldMap = + std::unordered_map>; + using IdToFieldMapRef = std::reference_wrapper; + struct NameIdMap { /// \brief Mapping from canonical field name to ID /// @@ -201,28 +220,38 @@ class ICEBERG_EXPORT Schema : public StructType { /// 'list.element.field' instead of 'list.field'. std::unordered_map id_to_name; }; + using NameIdMapRef = std::reference_wrapper; - static Result>> - InitIdToFieldMap(const Schema&); - static Result InitNameIdMap(const Schema&); - static Result>> - InitLowerCaseNameToIdMap(const Schema&); - static Result>> InitIdToPositionPath( - const Schema&); - static Result InitHighestFieldId(const Schema&); + using LowercaseNameToIdMap = + std::unordered_map>; + using LowercaseNameToIdMapRef = std::reference_wrapper; - const int32_t schema_id_; - /// Field IDs that uniquely identify rows in the table. - std::vector identifier_field_ids_; - /// Mapping from field id to field. + using IdToPositionPathMap = std::unordered_map>; + using IdToPositionPathMapRef = std::reference_wrapper; + + Result GetIdToFieldMap() const; + Result GetNameIdMap() const; + Result GetLowercaseNameToIdMap() const; + Result GetIdToPositionPathMap() const; + Result GetHighestFieldId() const; + + private: + static Result InitIdToFieldMap(const Schema* schema); + static Result InitNameIdMap(const Schema* schema); + static Result InitLowerCaseNameToIdMap(const Schema* schema); + static Result InitIdToPositionPath(const Schema* schema); + static Result InitHighestFieldId(const Schema* schema); + + const Schema* schema_; + // Mapping from field id to field. Lazy id_to_field_; - /// Mapping from field name to field id. + // Mapping from field name to field id. Lazy name_id_map_; - /// Mapping from lowercased field name to field id + // Mapping from lowercased field name to field id. Lazy lowercase_name_to_id_; - /// Mapping from field id to (nested) position path to access the field. + // Mapping from field id to (nested) position path to access the field. Lazy id_to_position_path_; - /// Highest field ID in the schema. + // Highest field ID in the schema. Lazy highest_field_id_; }; diff --git a/src/iceberg/snapshot.cc b/src/iceberg/snapshot.cc index f421e8381..5b6aabc10 100644 --- a/src/iceberg/snapshot.cc +++ b/src/iceberg/snapshot.cc @@ -85,15 +85,15 @@ bool Snapshot::Equals(const Snapshot& other) const { schema_id == other.schema_id; } -Result CachedSnapshot::InitManifestsCache( - const Snapshot& snapshot, std::shared_ptr file_io) { +Result SnapshotCache::InitManifestsCache( + const Snapshot* snapshot, std::shared_ptr file_io) { if (file_io == nullptr) { return InvalidArgument("Cannot cache manifests: FileIO is null"); } // Read manifest list ICEBERG_ASSIGN_OR_RAISE(auto reader, - ManifestListReader::Make(snapshot.manifest_list, file_io)); + ManifestListReader::Make(snapshot->manifest_list, file_io)); ICEBERG_ASSIGN_OR_RAISE(auto manifest_files, reader->Files()); std::vector manifests; @@ -118,21 +118,21 @@ Result CachedSnapshot::InitManifestsCache( return std::make_pair(std::move(manifests), data_manifests_count); } -Result> CachedSnapshot::Manifests( +Result> SnapshotCache::Manifests( std::shared_ptr file_io) const { ICEBERG_ASSIGN_OR_RAISE(auto cache_ref, manifests_cache_.Get(snapshot_, file_io)); auto& cache = cache_ref.get(); return std::span(cache.first.data(), cache.first.size()); } -Result> CachedSnapshot::DataManifests( +Result> SnapshotCache::DataManifests( std::shared_ptr file_io) const { ICEBERG_ASSIGN_OR_RAISE(auto cache_ref, manifests_cache_.Get(snapshot_, file_io)); auto& cache = cache_ref.get(); return std::span(cache.first.data(), cache.second); } -Result> CachedSnapshot::DeleteManifests( +Result> SnapshotCache::DeleteManifests( std::shared_ptr file_io) const { ICEBERG_ASSIGN_OR_RAISE(auto cache_ref, manifests_cache_.Get(snapshot_, file_io)); auto& cache = cache_ref.get(); diff --git a/src/iceberg/snapshot.h b/src/iceberg/snapshot.h index 3889607f9..86a7fe20d 100644 --- a/src/iceberg/snapshot.h +++ b/src/iceberg/snapshot.h @@ -265,13 +265,13 @@ struct ICEBERG_EXPORT Snapshot { /// \brief A snapshot with cached manifest loading capabilities. /// -/// This class wraps a Snapshot reference and provides lazy-loading of manifests. -class ICEBERG_EXPORT CachedSnapshot { +/// This class wraps a Snapshot pointer and provides lazy-loading of manifests. +class ICEBERG_EXPORT SnapshotCache { public: - explicit CachedSnapshot(const Snapshot& snapshot) : snapshot_(snapshot) {} + explicit SnapshotCache(const Snapshot* snapshot) : snapshot_(snapshot) {} /// \brief Get the underlying Snapshot reference - const Snapshot& snapshot() const { return snapshot_; } + const Snapshot& snapshot() const { return *snapshot_; } /// \brief Returns all ManifestFile instances for either data or delete manifests /// in this snapshot. @@ -303,11 +303,11 @@ class ICEBERG_EXPORT CachedSnapshot { /// \param snapshot The snapshot to initialize the manifests cache for /// \param file_io The FileIO instance to use for reading the manifest list /// \return A result containing the manifests cache - static Result InitManifestsCache(const Snapshot& snapshot, + static Result InitManifestsCache(const Snapshot* snapshot, std::shared_ptr file_io); /// The underlying snapshot data - const Snapshot& snapshot_; + const Snapshot* snapshot_; /// Lazy-loaded manifests cache Lazy manifests_cache_;