在 boost multi_index_container 中查找元素

问题描述

在我的代码中,我需要有一个功能来遍历所有元素并尽快检查是否已经存在某个元素,所以我的选择落在了 boost 多索引容器上,我可以在其中使用 vector 和 unordered_set 接口作为我的同时类动物。问题是我无法通过 unordered_set 接口找到某些元素,因为我将 key 从 std::string 替换为 std::array 并调整了代码,我不知道我做错了什么?

代码https://wandbox.org/permlink/dnCaEzYVdXkTFBGo

#include <array>
#include <algorithm>
#include <iostream>
#include <chrono>
#include <string>
#include <vector>
#include <list>
#include <map>
#include <set>
#include <unordered_map>
#include <unordered_set>
#include <memory>

#include <boost/multi_index_container.hpp>
#include <boost/multi_index/ordered_index.hpp>
#include <boost/multi_index/composite_key.hpp>
#include <boost/multi_index/hashed_index.hpp>
#include <boost/multi_index/sequenced_index.hpp>
#include <boost/multi_index/random_access_index.hpp>
#include <boost/multi_index/member.hpp>
#include <boost/multi_index/identity.hpp>

int constexpr elements_size{ 1'000'000 };

struct Animal
{
Animal(std::string name,std::string description,int leg,int age,double maxSpeed) noexcept :
    description_{std::move(description)},leg_{leg},age_{age},maxSpeed_{maxSpeed}
{
    std::copy(name.begin(),name.end(),name_.data());
}

Animal(std::string const& name,std::string const& description) noexcept :
    description_{description}
{
    std::copy(name.begin(),name_.data());
}

Animal(Animal&& animal) noexcept
{
    name_ = name_;
    description_ = std::move(animal).description_;
    leg_ = animal.leg_;
    age_ = animal.age_;
    maxSpeed_ = animal.maxSpeed_;
}

Animal(Animal const& animal) noexcept
{
    name_ = animal.name_;
    description_ = animal.description_;
    leg_ = animal.leg_;
    age_ = animal.age_;
    maxSpeed_ = animal.maxSpeed_;
}

Animal& operator=(Animal&& animal) noexcept
{
    name_ = name_;
    description_ = std::move(animal).description_;
    leg_ = animal.leg_;
    age_ = animal.age_;
    maxSpeed_ = animal.maxSpeed_;
    return *this;
}

Animal& operator=(Animal const& animal) noexcept
{
    name_ = animal.name_;
    description_ = animal.description_;
    leg_ = animal.leg_;
    age_ = animal.age_;
    maxSpeed_ = animal.maxSpeed_;
    return *this;
}

std::array<char,50> name_;
std::string description_;
int leg_{0};
int age_{0};
double maxSpeed_{0.0};
};

struct Hasher
{
bool print_;
Hasher(bool print = false): print_{print} {}
std::size_t operator()(std::array<char,50> const& name) const
{
    if (print_)
        std::cout << "array hash" << std::hash<std::string_view>{}(name.data()) << std::endl;
    return std::hash<std::string_view>{}(name.data());
}
std::size_t operator()(std::string const& name) const
{
    if (print_)
        std::cout << "string hash" << std::hash<std::string_view>{}(name.c_str()) << std::endl;
    return std::hash<std::string_view>{}(name.c_str());
}
std::size_t operator()(const char* name) const
{
    if (print_)
        std::cout << "char hash" << std::hash<std::string_view>{}(name) << std::endl;
    return std::hash<std::string_view>{}(name);
}
};

struct KeysComparator
{
bool operator()(std::array<char,50> const& a1,std::array<char,50> const& a2) const {return a1 == a2; }
template <typename T>
bool operator()(std::string const& n1,T const& t) const
{
    std::cout << "### value.name_" << t.value.name_.data() << ",n1: " << n1 << std::endl;
    return n1 == t.value.name_.data();
}
};

template<typename TimePoint>
std::string getelapsedtime(TimePoint const& start,TimePoint const& end)
{
auto micro = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
auto milli = std::chrono::duration_cast<std::chrono::milliseconds>(micro);
auto sec = std::chrono::duration_cast<std::chrono::seconds>(milli);

return {std::to_string(micro.count()) + " µs," + std::to_string(milli.count()) + " ms," + std::to_string(sec.count()) + " s"};
}

template<typename TimePoint>
void printStatistics(TimePoint const& emplace_start,TimePoint const& emplace_end,TimePoint const& iterate_start,TimePoint const& iterate_end,TimePoint const& find_start,TimePoint const& find_end,intmax_t const sum,std::string target)
{
std::cout << "Elapsed time emplace: "  << getelapsedtime(emplace_start,emplace_end)
    << "  |  iterate: " << getelapsedtime(iterate_start,iterate_end)
    << "  |  find: " << getelapsedtime(find_start,find_end)
    << ",sum:" << sum << ",calculation for " << target << std::endl;
}

void test()
{
using namespace boost::multi_index;
using Animal_multi = multi_index_container<Animal,indexed_by<
    random_access<>,hashed_unique<
        composite_key<Animal,member<Animal,50>,&Animal::name_>>,composite_key_hash<Hasher>,composite_key_equal_to<KeysComparator>>
>>;

Animal_multi container;
auto emplace_start = std::chrono::steady_clock::Now();
for (auto i = 0; i < elements_size; ++i)
    container.emplace_back("the really long name of some animal 12345678910_" + std::to_string(i),"bla bla bla bla bla bla bla bla bla bla bla bla bla",4,i,i + 2);
auto emplace_end = std::chrono::steady_clock::Now();

intmax_t sum{0};
auto iterate_start = std::chrono::steady_clock::Now();
for (auto const& e : container)
    sum += e.age_;
auto iterate_end = std::chrono::steady_clock::Now();

KeysComparator key_comparator;
Hasher hasher{true};
auto find_start = std::chrono::steady_clock::Now();
auto &container_interface = container.get<1>();
auto isSucceeded = container_interface.count("the really long name of some animal 12345678910_" + std::to_string(elements_size-1),hasher,key_comparator);
if (not isSucceeded)
    std::cout << "WARN: Element has not been found." << std::endl;
auto find_end = std::chrono::steady_clock::Now();

printStatistics(emplace_start,emplace_end,iterate_start,iterate_end,find_start,find_end,sum,"Animal_multi (boost multi_index)");
}

int main()
{
    test();
    return 0;
}

解决方法

移动构造函数中有许多错误:

name_ = name_; // oops this does nothing at all

只需关注Rule Of Zero。这也会通知您std::string 副本/分配不是 noexcept

名称副本应该有长度限制:

std::copy_n(name.begin(),std::min(name.size(),name_.size()),name_.data());

此时我注意到一些可以解释您的问题的东西:您没有以 NUL 终止,也没有确保数组是 0 初始化的。

宾果游戏

确实,我发现只有几行:

return std::hash<std::string_view>{}(name.data());

那是... UB!您的 string_view 可能包含不确定的数据,但更糟糕的是,您永远不会复制终止 NUL 字符。因此,std::string_view 将对长度不确定的字符串建模,该字符串可能会超过 50。

在此处阅读有关 Nasal Demons (UB)

这就是为旧的 C 工艺跳过标准库类型的危险。

第一次挖掘

因此,这是具有相同/更好特征的整个班级:

using Name = std::array<char,50>;

struct Animal {
    Animal(std::string_view name,std::string description,int leg = 0,int age = 0,double maxSpeed = 0) noexcept 
        : name_{0},// zero initialize!
          description_{std::move(description)},leg_{leg},age_{age},maxSpeed_{maxSpeed}
    {
        constexpr auto Capacity = std::tuple_size<Name>::value;
        constexpr auto MaxLen = Capacity - 1; // reserve NUL char

        assert(name.length() < MaxLen);
        std::copy_n(name.data(),std::min(name.length(),MaxLen),name_.data());
    }

    //Animal            ( Animal&&      animal  ) noexcept = default;
    //Animal            ( Animal const& animal  )          = default;
    //Animal& operator= ( Animal&&      animal  ) noexcept = default;
    //Animal& operator= ( Animal const& animal  )          = default;

    Name        name_;
    std::string description_;
    int         leg_{0};
    int         age_{0};
    double      maxSpeed_{0.0};
};

改进:FixedString

这只是尖叫更好的Name类型。怎么样,FixedString

template <size_t N> struct FixedString {
    static_assert(N > 1); // require space for NUL char

    FixedString(std::string_view s) : data_{0} {
        if (s.length() >= N)
            throw std::length_error("FixedString");
        std::copy_n(s.data(),std::min(s.length(),N - 1),data());
    }

    std::string_view str() const { return { data(),size() }; }
    operator std::string_view() const { return str(); }

    auto data()   const { return data_.data(); }
    auto data()         { return data_.data(); }
    auto c_str()  const { return data_.data(); }
    auto c_str()        { return data_.data(); }

    auto begin() const { return data_.begin(); }
    auto end()   const { return data_.end();   }
    auto begin()       { return data_.begin(); }
    auto end()         { return data_.end();   }

    size_t size() const {
        auto terminator = std::memchr(data(),data_.max_size());
        return terminator
            ? static_cast<char const*>(terminator) - data()
            : data_.max_size();
    };

    bool operator<(FixedString const& rhs)       const { return str() <  rhs.str(); }
    bool operator==(FixedString const& rhs)      const { return str() == rhs.str(); }
    bool operator!=(FixedString const& rhs)      const { return str() != rhs.str(); }
    // optimizations:
    bool operator<(std::string_view const& rhs)  const { return str() <  rhs.substr(0,N-1); }
    bool operator==(std::string_view const& rhs) const { return str() == rhs.substr(0,N-1); }
    bool operator!=(std::string_view const& rhs) const { return str() != rhs.substr(0,N-1); }

  private:
    std::array<char,N> data_;
};

现在你可以简单地

using Name = FixedString<50>;

并且您所有的 Name 都将神奇地(并且安全地)在字符串视图之间进行转换。

using Name = FixedString<50>;

struct Animal {
    Animal(std::string_view name,double maxSpeed = 0) noexcept 
        : name_{name},description_{std::move(description)},maxSpeed_{maxSpeed}
    { }

    Name        name_;
    std::string description_;
    int         leg_{0};
    int         age_{0};
    double      maxSpeed_{0.0};
};

一切都通过正确的抽象来简化

这是我认为我在我的编程生涯中学到的最重要的一课:选择正确的抽象会导致简单。在这里,我们蒸发了两个杂乱的助手:

using Hasher         = std::hash<std::string_view>;
using KeysComparator = std::equal_to<Name>;

轰隆隆。他们会做你拥有的一切,但更好。

现在,缺失的元素

在将整个内容简化为 this 之后,很明显std::array<char,50> 可以永远正确包含长度超过 50 个字符的名称。事实上,检查插入:

auto emplace_start = Now();

size_t duplicates = 0;
for (auto i = 0; i < elements_size; ++i) {
    auto [_,ok] = container.emplace_back(
        make_name(i),"bla bla bla bla bla bla bla bla bla bla bla bla bla",4,i,i + 2);

    if (!ok) ++duplicates;
}
if (duplicates) {
    std::cerr << "Oops," << duplicates << " duplicate keys not inserted\n";
}

auto emplace_end = Now();

揭示:

Oops,999990 duplicate keys not inserted
Elapsed time emplace: 116.491ms iterate: 0.000145ms find: 0.000597ms,sum:45,calculation for Animal_multi (boost multi_index)

至少,现在您将 Undefined Behaviour 替换为 约束检查。

当然,只需增加名称容量即可解决:[https://wandbox.org/permlink/6AamJfXe76nYALfR)

using Name = FixedString<60>;

打印:

Elapsed time emplace: 594.475ms iterate: 18.6076ms find: 0.003138ms,sum:499999500000,calculation for Animal_multi (boost multi_index)

或者,您可以使用过长的名称添加 Name 结构:Live On Wandbox

FixedString(std::string_view s) : data_{0} {
    if (s.length() >= N)
        throw std::length_error("FixedString");
    std::copy_n(s.data(),data());
}

正式印刷

terminate called after throwing an instance of 'std::length_error'
  what():  FixedString

Full Listing

此演示使用 FixedString<60> 来避免关键错误:

#include <boost/multi_index_container.hpp>
#include <boost/multi_index/hashed_index.hpp>
#include <boost/multi_index/random_access_index.hpp>
#include <boost/multi_index/member.hpp>
#include <iostream>
#include <iomanip>
#include <chrono>
using namespace std::chrono_literals;

int constexpr elements_size{ 1'000'000 };

template <size_t N> struct FixedString {
    static_assert(N > 1); // require space for NUL char

    FixedString(std::string_view s) : data_{0} {
        if (s.length() >= N)
            throw std::length_error("FixedString");
        std::copy_n(s.data(),data_.max_size());
        return terminator
            ? static_cast<char const*>(terminator) - data()
            : data_.max_size();
    };

    bool operator<(std::string_view const& rhs)  const { return str() <  rhs.substr(0,N-1); }
    bool operator<(FixedString const& rhs)       const { return str() <  rhs.str(); }
    bool operator==(FixedString const& rhs)      const { return str() == rhs.str(); }
    bool operator!=(FixedString const& rhs)      const { return str() != rhs.str(); }

  private:
    std::array<char,N> data_;
};

using Name = FixedString<60>;

struct Animal {
    Animal(std::string_view name,maxSpeed_{maxSpeed}
    { }

    Name        name_;
    std::string description_;
    int         leg_{0};
    int         age_{0};
    double      maxSpeed_{0.0};
};

using Hasher         = std::hash<std::string_view>;
using KeysComparator = std::equal_to<Name>;

using Clock     = std::chrono::steady_clock;
using Duration  = Clock::duration;
static auto Now = Clock::now;

void printStatistics(Duration emplace,Duration iterate,Duration find,intmax_t const sum,std::string target)
{
    std::cout << "Elapsed time"
        << " emplace: " << (emplace/1.0ms) << "ms"
        << " iterate: " << (iterate/1.0ms) << "ms"
        << " find: " << (find/1.0ms) << "ms"
        << ",sum:" << sum
        << ",calculation for " << target
        << std::endl;
}

void test() {
    namespace bmi = boost::multi_index;
    using Animal_multi = bmi::multi_index_container<Animal,bmi::indexed_by<
        bmi::random_access<>,bmi::hashed_unique<
            bmi::tag<struct by_name>,bmi::member<Animal,Name,&Animal::name_>,Hasher,KeysComparator>
        >
    >;

    Animal_multi container;

    auto make_name = [](size_t id) {
        return "the really long name of some animal 12345678910_" + std::to_string(id);
    };

    auto emplace_start = Now();

    size_t duplicates = 0;
    for (auto i = 0; i < elements_size; ++i) {
        auto [_,ok] = container.emplace_back(
            make_name(i),i + 2);

        if (!ok) ++duplicates;
    }
    if (duplicates) {
        std::cerr << "Oops," << duplicates << " duplicate keys not inserted\n";
    }

    auto emplace_end = Now();

    intmax_t sum{ 0 };
    auto iterate_start = Now();
    for (auto const& e : container) {
        sum += e.age_;
    }
    auto iterate_end = Now();

    auto find_start = Now();
    {
        auto& name_idx = container.get<by_name>();
        auto last_key = make_name(elements_size - 1);
        if (name_idx.count(std::string_view(last_key)) == 0u) {
            std::cout << "WARN: Element has not been found." << std::endl;
        }
    }
    auto find_end = Now();

    printStatistics(
            emplace_end - emplace_start,iterate_end - iterate_start,find_end - find_start,sum,"Animal_multi (boost multi_index)");
}

int main() { test(); }