I am serializing a set of `content`s, indexed by multiple properties using `boost::multi_index_container`, together with a `params` struct, into a binary archive that I want to deserialize later. But an archive created with Boost 1.74 is unreadable ("Invalid or corrupted archive") when read with Boost 1.83.
I have included an MRE in the git repo. Although it is a single small cpp file, I made a repo so I could share it together with the CMakeLists.txt and the Dockerfile. Following is my content struct:
```cpp
struct content{
    friend class boost::serialization::access;
    using angle_type = std::size_t;
    inline content(angle_type angle): _angle(angle) {}
    inline angle_type angle() const { return _angle; }
    void reset_angle_random(){
        static std::random_device dev;
        static std::mt19937 rng_angle(dev());
        std::uniform_int_distribution<> angle_dist(0, 180);
        _angle = angle_dist(rng_angle);
    }
    void freeze(){
        // complicated deterministic business logic
        _angle = 0;
    }
    content frozen() const{
        mre::content copy(*this);
        copy.freeze();
        return copy;
    }
    static content generate(){
        static std::random_device dev;
        static std::mt19937 rng(dev());
        std::uniform_real_distribution<> dist_length(-0.5f, 0.5f);
        mre::content content{0};
        content._length = dist_length(rng);
        content.reset_angle_random();
        return content;
    }
    template<class Archive>
    void serialize(Archive & ar, const unsigned int version) {
        ar & boost::serialization::make_nvp("length", _length);
        ar & boost::serialization::make_nvp("angle", _angle);
    }
    friend std::size_t hash_value(content const& c){
        std::size_t seed = 0;
        boost::hash_combine(seed, c._length);
        boost::hash_combine(seed, c._angle);
        return seed;
    }
    inline std::size_t hash() const { return boost::hash<mre::content>{}(*this); }
    inline std::size_t frozen_id() const { return frozen().hash(); }
    inline std::string id() const { return (boost::format("%1%~%2%-%3%") % frozen_id() % hash() % angle()).str(); }
    inline bool operator<(const content& other) const { return id() < other.id(); }
    private:
        double _length;
        angle_type _angle;
    private:
        content() = default;
};
```
The actual code I am working on is much larger and does not use the `content` struct shown here; the struct above is a heavily reduced version to make a minimal reproducible example. Following is my multi-index container setup:
```cpp
struct package{
    friend class boost::serialization::access;
    struct tags{
        struct id{};
        struct content{};
        struct angle{};
        struct frozen{};
    };
    using container = boost::multi_index_container<
        mre::content,
        boost::multi_index::indexed_by<
            boost::multi_index::ordered_unique<boost::multi_index::identity<mre::content>>,
            boost::multi_index::ordered_unique<boost::multi_index::tag<tags::id>, boost::multi_index::const_mem_fun<mre::content, std::string, &mre::content::id>>,
            boost::multi_index::ordered_non_unique<boost::multi_index::tag<tags::content>, boost::multi_index::const_mem_fun<mre::content, std::size_t, &mre::content::hash>>,
            boost::multi_index::ordered_non_unique<boost::multi_index::tag<tags::angle>, boost::multi_index::const_mem_fun<mre::content, mre::content::angle_type, &mre::content::angle>>,
            boost::multi_index::ordered_non_unique<boost::multi_index::tag<tags::frozen>, boost::multi_index::const_mem_fun<mre::content, std::size_t, &mre::content::frozen_id>>
        >
    >;
    inline explicit package(const mre::parameters& params): _loaded(false), _parameters(params) {}
    inline explicit package(): _loaded(false) {}
    void save(const std::string& filename) const;
    void load(const std::string& filename);
    inline std::size_t size() const { return _samples.size(); }
    inline bool loaded() const { return _loaded; }
    const mre::content& operator[](const std::string& id) const;
    const mre::parameters& params() const { return _parameters; }
    template<class Archive>
    void serialize(Archive & ar, const unsigned int version) {
        ar & boost::serialization::make_nvp("samples", _samples);
        ar & boost::serialization::make_nvp("params", _parameters);
    }
    public:
        std::size_t generate(std::size_t contents, std::size_t angles);
    private:
        bool _loaded;
        container _samples;
        mre::parameters _parameters;
};
```
I am also serializing a set of parameters, shown below:
```cpp
struct parameters{
    std::size_t degree;
    std::size_t frame_size;
    template<class Archive>
    void serialize(Archive & ar, const unsigned int version) {
        ar & boost::serialization::make_nvp("degree", degree);
        ar & boost::serialization::make_nvp("frame_size", frame_size);
    }
};
```
Saving, loading and generating are done as follows:
```cpp
void mre::package::save(const std::string& filename) const {
    std::ofstream stream(filename, std::ios::binary);
    try{
        boost::archive::binary_oarchive out(stream, boost::archive::no_tracking);
        std::cout << "serialization library version: " << out.get_library_version() << std::endl;
        out << *this;
    } catch(const std::exception& e){
        std::cout << "Error saving archive: " << e.what() << std::endl;
    }
    stream.close();
}
void mre::package::load(const std::string& filename){
    std::ifstream stream(filename, std::ios::binary);
    try{
        boost::archive::binary_iarchive in(stream, boost::archive::no_tracking);
        std::cout << "serialization library version: " << in.get_library_version() << std::endl;
        in >> *this;
        _loaded = true;
    } catch(const std::exception& e){
        std::cout << "Error loading archive: " << e.what() << std::endl;
    }
    stream.close();
}
std::size_t mre::package::generate(std::size_t contents, std::size_t angles){
    std::size_t count = 0;
    std::size_t v_content = 0;
    while(v_content++ < contents){
        mre::content x = mre::content::generate();
        std::size_t v_angle = 0;
        while(v_angle++ < angles){
            mre::content x_angle = x;
            x_angle.reset_angle_random(); // commenting out this line makes it work
            if (_samples.insert(x_angle).second)
                ++count;
        }
    }
    return count;
}
```
It looks like a bug in Boost.MultiIndex, but I am not aware of any such existing bug. I can reproduce the problem by compiling the MRE on an Arch Linux machine, which has the latest version of the Boost libraries. The MRE also contains a Docker target which compiles the same code in an Ubuntu 22.04 image, where the default Boost version is 1.74. The issue can be tested using the `mre` executable as follows:
```sh
cd build
cmake .. && make
./mre pack archive_name 10  # serialize 10 randomly generated contents and save to a file named archive_name
./mre unpack archive_name   # deserialize
```
In order to test the incompatibility, it can be compiled using Docker:
```sh
make docker              # compiles and generates a file named arc inside the build/archives directory of the host machine
./mre unpack archives/arc  # throws the exception
```
2 Answers
Looking at this for a long time, I couldn't see it. However, by fixing the seeds and verifying that we get deterministic data, I noticed that the results were "identical" except for the order.
I noticed the default index already relies on the hash indirectly, multiple times: since the first index is also unique, and its only constituent parts are hashes and the angle, this can cause different uniqueness behavior across versions of Boost.ContainerHash.
Boost's `hash_combine` does not guarantee stability or portability. In fact, most common hash functions don't, e.g. `std::hash`. Persisting information that depends on such a hash is a logic error anywhere, unless you are only re-reading the same information in the same process.
Specifically, `hash_combine` has received many changes between 1.74 and 1.83. You should rethink your indexes. In fact, I would consider it a smell that a hash depending on non-unique hashes is being used as the key (`identity`) of a unique index.
Fixing?
To avoid violating the total-ordering contract that the index expects (it's basically as if you edited the key fields by "editing" the hash function), I'd expect the hash to be something like:
And then perhaps something more like:
Here I substituted libfmt for Boost Format, because it can directly format tuples without me doing the work 🙂
Basically, I would not throw away the information, which seemed like a code smell anyway, and which also caused the indexes to rely on non-deterministic functions.
Here's my motivating code listing, complete with the tweaks to optionally use a fixed seed:
Live On Coliru
TL;DR
Basically, don't ever use non-perfect hashes as keys. Additionally, don't rely on determinism of the algorithm, except with published cryptographic digests.
The immediate problem is that you're violating a loading-time requirement known as predicate serialization compatibility. As it applies to ordered indices, the requirement is that `value_comp()` at saving time be compatible with `value_comp()` at loading time, that is, both must behave the same for equivalent pairs of elements, where `value_comp(x,y)` is defined as `pred(key(x),key(y))`, `pred` is the less-than comparator (`std::less<...>` in your case) and `key` is the key extractor you provide for each index. As these key extractors depend on the implementation of `boost::hash`, and this has changed from Boost 1.74 to Boost 1.83, the requirement is violated and the loading code can't make sense of what it's retrieving, hence the exception.
If you want to know what's happening inside, let's look closer at the problem (this is only provided for the very curious, really, as it goes into the private implementation of Boost.MultiIndex). Your container is defined as shown above,
where indices #0, #1, #2 and #4 depend directly or indirectly on `boost::hash` and are therefore violating the requirements. Additionally, the serialization code for `mre::content` does not save any info dependent on `boost::hash` (which is calculated on the fly, so to say). Now, the internal serialization algorithm of Boost.MultiIndex goes, roughly, like this: the elements are first saved in the order of index #0, and then, for each remaining index, positional information is saved so that the index's ordering can be restored at loading time by referring back to previously saved elements.
It's the last step that causes the problem, and in particular within index #4 (the one based on `mre::content::frozen_id`). Note that your code generates many elements with the same `frozen_id()`, as this function only depends on `_length` (recall that `freeze()` resets `_angle` to 0 before hashing, so the angle contributes only a constant).
So at saving time you may end up with a bunch of elements sharing the same `frozen_id()` generated with Boost 1.74, which will then be loaded as a bunch of elements with different ids (because you're using Boost 1.83). And now the problem goes like this: when trying to restore the order of the elements whose saved id was 9310016153812194557, we use as anchor (the preceding element) the element that was saved with id 10387073523272624522; but this element will now have a different id and, almost certainly, will not be the one preceding this group at loading time. If you followed me so far, this is the problem you're encountering (the short explanation is, of course, that you're violating the requirements of the library).
Now, moving forward you should pay attention to @sehe's answer on how to do serialization without depending on functions that have no fixed implementation. But I understand you have this code in production and you really need to load these legacy archives. You can hack your way out using either of the following:
Solution 1: By the time the exception is thrown, the elements are already loaded in the container, so you can catch the exception and continue.
Solution 2: You can load the contents into an intermediate container that does not have the problems on the last index we've described. This is wildly unspecified, mind you, but it should work. Use it ONLY to handle legacy archives; moving forward, do as @sehe says.
Live Coliru Demo