skip to Main Content

I am interested in calculating the nth term of higher order Fibonacci sequence in C++, using matrix exponentiation, while allowing n to be reasonably large (in the range of hundreds).

The sequences I have described are not Fibonacci sequence only but generalizations of it, of which Fibonacci is order 2, in order 3 3 previous terms are used to calculate the next term, i.e. F(n) = F(n – 1) + F(n – 2) + F(n – 3), this is usually called Tribonacci, order 4 is Tetranacci, and order 5 Pentanacci… I wish to calculate series up to 36nacci.

I know Fibonacci like sequences grow extremely fast, so I used boost/multiprecision/cpp_int.hpp to allow arbitrary length integers in C++, to let the function be able to calculate several hundredth term.

Order here means number of previous terms used to calculate the next term, Fibonacci is order 2, Tribonacci is order 3 and Tetranacci order 4, etc. Also, the series starts with n – 1 zeros and then S[n] == 1 for series S of order n, i.e. order 2 starts with [0, 1] and 3 [0, 0, 1] and 4 [0, 0, 0, 1], this specification is consistent with Wikipedia and OEIS and the logic of the sequences.

I have found the matrices that could be used to generate terms for orders 2, 3 and 4 online, and I have found the patterns of how the matrices are made, and wrote functions to generate matrices for arbitrary high orders.

import numpy as np

def onacci_matrix(n: int) -> np.ndarray:
    mat = np.zeros((n, n), dtype=np.uint64)
    mat[0, :] = 1
    for i in range(1, n):
        mat[i, i - 1] = 1

    if not n % 2:
        mat = mat.T

    return mat

The matrices for the first five orders are:

In [336]: onacci_matrix(2)
Out[336]:
array([[1, 1],
       [1, 0]], dtype=uint64)

In [337]: onacci_matrix(3)
Out[337]:
array([[1, 1, 1],
       [1, 0, 0],
       [0, 1, 0]], dtype=uint64)

In [338]: onacci_matrix(4)
Out[338]:
array([[1, 1, 0, 0],
       [1, 0, 1, 0],
       [1, 0, 0, 1],
       [1, 0, 0, 0]], dtype=uint64)

In [339]: onacci_matrix(5)
Out[339]:
array([[1, 1, 1, 1, 1],
       [1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0]], dtype=uint64)

In [340]: onacci_matrix(6)
Out[340]:
array([[1, 1, 0, 0, 0, 0],
       [1, 0, 1, 0, 0, 0],
       [1, 0, 0, 1, 0, 0],
       [1, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 0]], dtype=uint64)

More specifically, using zero-based indexing, the item located at row 0 column 0 of the base matrix raised to ith power is the item with index i + o – 1 of the series of order o.

from typing import List


def onacci_iterative(n: int, o: int) -> List[int]:
    series = []
    stack = [0] * (o - 1) + [1]
    for _ in range(n + 1):
        stack.append(sum(stack))
        series.append(stack.pop(0))

    return series

ONACCI6 = onacci_matrix(6)
In [342]: np.linalg.matrix_power(ONACCI6, 19)
Out[342]:
array([[233904, 117920,  59448,  29970,  15109,   7617],
       [230064, 115984,  58472,  29478,  14861,   7492],
       [222447, 112144,  56536,  28502,  14369,   7244],
       [207338, 104527,  52696,  26566,  13393,   6752],
       [177368,  89418,  45079,  22726,  11457,   5776],
       [117920,  59448,  29970,  15109,   7617,   3840]], dtype=uint64)

In [343]: onacci_iterative(24, 6)[24]
Out[343]: 233904

Because exponentiation is involved, this can be done using exponentiation by squaring and thus done in logarithmic time.

The matrices are stored as flat vectors to get maximum performance, I have written functions to inspect which elements of a and b are needed for each element of a dot b, and calculated the exact index that for the series of order o has the maximum term representable by unsigned integers of a given width:

def matmul_indices(n: int) -> dict:
    indices = {}
    for y in range(n):
        s = y * n
        for x in range(n):
            indices[s + x] = [(s + z, z * n + x) for z in range(n)]

    return indices


def onacci_limits(o: int, lim: int) -> dict:
    limits = []
    lim = 1 << lim
    for i in range(2, o + 1):
        n = 0
        while onacci(n, i)[-1] <= lim:
            n += 1

        limits.append(n - 1)

    return limits

So the following is the C++ code I have written so far:

#include <array>
#include <boost/multiprecision/cpp_int.hpp>
#include <chrono>
#include <iostream>
#include <vector>

using boost::uint64_t;
using boost::multiprecision::uint128_t;
using boost::multiprecision::uint256_t;
using boost::multiprecision::uint512_t;
using std::array;
using std::chrono::high_resolution_clock;
using std::vector;

constexpr array<int, 35> ONACCI_64 = {
    93, 75, 71, 70, 70,
    71, 72, 73, 74, 75,
    76, 77, 78, 79, 80,
    81, 82, 83, 84, 85,
    86, 87, 88, 89, 90,
    91, 92, 93, 94, 95,
    96, 97, 98, 99, 100
};

constexpr array<int, 35> ONACCI_128 = { 
    186, 148, 139, 136, 135,
    135, 136, 137, 138, 139,
    140, 141, 142, 143, 144,
    145, 146, 147, 148, 149,
    150, 151, 152, 153, 154,
    155, 156, 157, 158, 159,
    160, 161, 162, 163, 164 
};

constexpr array<int, 35> ONACCI_256 = { 
    370, 293, 274, 267, 265,
    264, 264, 265, 266, 267,
    268, 269, 270, 271, 272,
    273, 274, 275, 276, 277,
    278, 279, 280, 281, 282,
    283, 284, 285, 286, 287,
    288, 289, 290, 291, 292 
};

constexpr array<int, 35> ONACCI_512 = { 
    739, 585, 544, 529, 524,
    521, 521, 521, 522, 523,
    524, 525, 526, 527, 528,
    529, 530, 531, 532, 533,
    534, 535, 536, 537, 538,
    539, 540, 541, 542, 543,
    544, 545, 546, 547, 548 
};


uint64_t square_sum(int n) {
    double sq, cu;
    sq = n * n;
    cu = sq * n;
    return uint64_t(floor(cu / 3 + sq / 2 + double(n) / 6 - 0.5));
}


vector<int> ONACCI_MATRIX(16205);

void generate_matrix(int n) {
    uint64_t start;
    start = square_sum(n - 1);
    if (n & 1) {
        for (int i = 0; i < n * n; i++) {
            auto [q, r] = std::div(i, n);
            ONACCI_MATRIX[start + i] = not q or q - r == 1;
        }
    }
    else {
        for (int i = 0; i < n * n; i++) {
            auto [q, r] = std::div(i, n);
            ONACCI_MATRIX[start + i] = not r or r - q == 1;
        }
    }
}

template <typename T>
vector<T> square_multiply(const vector<T>& a, const vector<T>& b, int n) {
    int square = n * n;
    vector<T> c(square);
    T s;
    for (int y = 0; y < square; y += n) {
        for (int x = 0; x < n; x++) {
            s = 0;
            for (int z = 0; z < n; z++) {
                s += a[y + z] * b[x + z * n];
            }
            c[y + x] = s;
        }
    }
    return c;
}

template <typename T>
T onacci_matrix(int n, int o) {
    if (o < 2 or o > 36) {
        throw std::invalid_argument("order must be between 2 and 36");
    }
    if (n < o - 1) {
        return 0;
    }
    if (n == o - 1) {
        return 1;
    }
    int square = n * n;
    vector<T> base(square), p(square);
    uint64_t start = square_sum(n - 1);
    for (uint64_t i = start; i < start + square; i++) {
        base[i] = ONACCI_MATRIX[i];
        p[i] = ONACCI_MATRIX[i];
    }
    n -= o;
    while (n) {
        if (n & 1) {
            p = square_multiply<T>(base, p, o);
        }
        base = square_multiply<T>(base, base, o);
        n >>= 1;
    }
    return p[0];
 }

int main()
{
    for (int i = 2; i <= 36; i++) {
        generate_matrix(i);
    }
    uint128_t n = onacci_matrix<uint128_t>(139, 4);
    std::cout << n;
}

The code is incomplete, I want to use template functions to test the performance of each type, and I want the program to act like a shell, with the user entering the order of the series and the index of the desired term, the index is then used to determine the smallest bit width that is necessary to make sure there is no overflow. The order must be between 2 and 36, and the term at the index cannot be greater than the range of uint512_t.

The code doesn’t work, it compiles successfully, but it crashes. I compiled using Visual Studio 2022, with C++20 standard, in Debug x64 mode, and I got error message:

Expression: vector subscript out of range

enter image description here

I am a beginner in C++ and I have no idea what caused the bug, or if there are other bugs, or if my code is completely wrong in the first place.

How to fix the code?


I have clicked retry already, and the program just crashes again. And that line 1886 is in the vector.h file, my program has only 136 lines.

2

Answers


  1. The title ask about a high level algorithm, but at the end your actual question is about a segmentation fault which, I deduce, is localized a logical error in your code.

    A quick replacement of .operator[](i) by .at(i), shows that your code goes out-of-bounds at least once in this loop:

        vector<T> base(square), p(square);
    ...
        for (uint64_t i = start; i < start + square; i++) {
            base.at(i) = ONACCI_MATRIX[i];
            p[i] = ONACCI_MATRIX[i];
        }
    

    https://godbolt.org/z/ExYxzfhq1

    The error has a bit more information than the MSVC assert dialog:

      what():  vector::_M_range_check: __n (which is 885568) >= this->size() (which is 19321)
    

    For any non-zero value of start it seems inevitable that this will happen.

    I don’t have a fix because I have no idea what you are trying to do.
    Perhaps you mean i < start + n; I don’t know.

    IMO, this sounds like the typical logical error when making linear arrays into 2D arrays instead of using a proper 2D array library.

    Login or Signup to reply.
  2. I didn’t know what the code was really trying to accomplish.

    So when piecing it together I accidentally rewrote the stuff. Here’s my iterative any-Order fibonacci:

    namespace ranges = std::ranges;
    namespace views  = ranges::views;
    using Int   = uint64_t;
    
    template <size_t Order> static constexpr auto onacci_iterative(uint64_t n) {
        std::array<Int, Order> window{};
        window.back() = 1;
        while (true) {
            if (auto val = window.front())
                if (n-- == 0)
                    return val;
    
            auto sum = ranges::fold_left(window, Int{}, std::plus<>{});
            ranges::copy(views::drop(window, 1), window.begin());
            window.back() = sum;
        }
    
        throw std::logic_error{"unreachable"};
    }
    

    Here’s a simple demo Live Online:

     2: [1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987]
     3: [1, 1, 2, 4, 7, 13, 24, 44, 81, 149, 274, 504, 927, 1705, 3136, 5768]
     4: [1, 1, 2, 4, 8, 15, 29, 56, 108, 208, 401, 773, 1490, 2872, 5536, 10671]
     5: [1, 1, 2, 4, 8, 16, 31, 61, 120, 236, 464, 912, 1793, 3525, 6930, 13624]
     6: [1, 1, 2, 4, 8, 16, 32, 63, 125, 248, 492, 976, 1936, 3840, 7617, 15109]
     7: [1, 1, 2, 4, 8, 16, 32, 64, 127, 253, 504, 1004, 2000, 3984, 7936, 15808]
     8: [1, 1, 2, 4, 8, 16, 32, 64, 128, 255, 509, 1016, 2028, 4048, 8080, 16128]
     9: [1, 1, 2, 4, 8, 16, 32, 64, 128, 256, 511, 1021, 2040, 4076, 8144, 16272]
    10: [1, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1023, 2045, 4088, 8172, 16336]
    

    Matrix Approach

    I won’t do all the matrix logic myself, instead using Boost Ublas:

    using Matrix = boost::numeric::ublas::matrix<Int>;
    

    First of all, let’s create the exponentation helper like you describe:

    template <class T> T my_pow(T b, int64_t e) {
        if (e == 1)
            return b;
        if (e % 2 == 1)
            return prod(b, my_pow(b, e - 1));
        T tmp = my_pow(b, e / 2);
        return prod(tmp, tmp);
    }
    

    Next, we need the onacci_matrix generator:

    Matrix onacci_matrix(Int order) {
        Matrix m(order, order);
        m.assign(boost::numeric::ublas::zero_matrix<Int>(order, order));
    
        for (Int i = 0; i < order; ++i)
            m(i, 0) = 1;
    
        for (Int i = 1; i < order; ++i)
            m(i-1, i) = 1;
    
        return order % 2 ? trans(m) : m;
    }
    

    To check that everything is correct, I created a benchmark comparison:

    template <size_t Order> void compare(uint64_t n) {
        auto timed = [](auto f) {
            auto start = now();
            auto r     = f();
            return std::tuple((now() - start), r);
        };
    
        auto [itime, itval] = timed([n] { return onacci_iterative<Order>(n); });
        auto [mtime, m]     = timed([n] { return my_pow(onacci_matrix(Order), n); });
        auto mval           = m(0, 0);
    
        fmt::print("order:{} {:<20} {:10L}μs {}n", Order, fmt::format("iterative({}):", n), itime / 1us, itval);
        fmt::print("order:{} {:<20} {:10L}μs {}n", Order, fmt::format("matrix^{}:", n), mtime / 1us, mval);
        fmt::print(" -- itval == mval? {}n", itval == mval);
    }
    

    And when you run that like

    compare<6>(19); // the sample from the question
    

    It prints

    order:6 iterative(19):                0μs 233904
    order:6 matrix^19:                   11μs 233904
     -- itval == mval? true
    

    This obviously matches the value from the question. Note that although at n=19 the iterative solution is obviously faster, it’s possible to check validity even without scaling the integer type:

    for (Int n = 19; n < 19e9; n *= 10) {
        compare<3>(n);
        compare<4>(n);
        compare<5>(n);
        compare<6>(n);
    }
    

    These all correctly check out, 36x "itval == mval? true":

    order:3 iterative(19):                0μs 66012
    order:3 matrix^19:                    2μs 66012
     -- itval == mval? true
    order:4 iterative(19):                0μs 147312
    order:4 matrix^19:                    1μs 147312
     -- itval == mval? true
    order:5 iterative(19):                0μs 203513
    order:5 matrix^19:                    2μs 203513
     -- itval == mval? true
    order:6 iterative(19):                0μs 233904
    order:6 matrix^19:                    7μs 233904
    // ...
    order:3 iterative(1900):              4μs 12869305375559491231
    order:3 matrix^1900:                  2μs 12869305375559491231
     -- itval == mval? true
    order:4 iterative(1900):             13μs 6881832060256714257
    order:4 matrix^1900:                  3μs 6881832060256714257
     -- itval == mval? true
    order:5 iterative(1900):             13μs 11498825699531939336
    order:5 matrix^1900:                  4μs 11498825699531939336
     -- itval == mval? true
    order:6 iterative(1900):             14μs 16271437555610019488
    order:6 matrix^1900:                  7μs 16271437555610019488
     -- itval == mval? true
    // ...
    order:3 iterative(1900000000):  3,176,897μs 7052199387128452865
    order:3 matrix^1900000000:            5μs 7052199387128452865
     -- itval == mval? true
    order:4 iterative(1900000000):  8,792,814μs 84094515927825409
    order:4 matrix^1900000000:            7μs 84094515927825409
     -- itval == mval? true
    order:5 iterative(1900000000):  8,793,596μs 4174985315637410720
    order:5 matrix^1900000000:           10μs 4174985315637410720
     -- itval == mval? true
    order:6 iterative(1900000000):  9,794,345μs 7912436391526238456
    order:6 matrix^1900000000:           13μs 7912436391526238456
     -- itval == mval? true
    

    Multiprecision

    To get the actually correct values, let’s use Boost Multiprecision. Instead of printing the entire values (which will be very large) we print the ₁₀log (basically, the number of digits).

    I benchmarked

    • Int = uint64_t: (overflows)

      order:6 iterative(190000):        1,364μs 10log:18.4885
      order:6 matrix^190000:                8μs 10log:18.4885
       -- itval == mval? true
      
      real    0m0,014s
      
    • Int = mp::int128_t: (overflows)

      order:6 iterative(190000):          803μs 10log:38.4783
      order:6 matrix^190000:               47μs 10log:38.4783
       -- itval == mval? true
      
      real    0m0,011s
      
    • Int = mp::int1024_t: (still overflows)

      order:6 iterative(190000):       29,524μs 10log:308.012
      order:6 matrix^190000:              498μs 10log:308.012
       -- itval == mval? true
      
      real    0m0,121s
      

    The only types that will likely fit the bill are arbitrary-precision integer types:

    • Int = mp::cpp_int: (never overflows, no link dependency)

      order:6 iterative(190000):    1,247,917μs 10log:56515.3
      order:6 matrix^190000:          198,435μs 10log:56515.3
       -- itval == mval? true
      
      real    0m4,039s
      
    • Int = mp::mpz_int: (can never overflow, uses GMP/MPFR)

      order:6 iterative(190000):    1,339,042μs 10log:56515.3
      order:6 matrix^190000:           81,846μs 10log:56515.3
       -- itval == mval? true
      
      real    0m4,057s
      

    Here’s the code with the mpz_int enabled: Live On Coliru

    #define NDEBUG
    #include <algorithm>
    #include <array>
    #include <chrono>
    #include <cstdint>
    #include <numeric>
    #include <ranges>
    #include <vector>
    
    using namespace std::chrono_literals;
    static constexpr auto now = std::chrono::high_resolution_clock::now;
    
    namespace ranges = std::ranges;
    namespace views  = ranges::views;
    
    template <size_t Order, typename Int> static constexpr auto onacci_iterative(uint64_t n) {
        std::array<Int, Order> window{};
        window.back() = 1;
        while (true) {
            if (Int const& val = window.front())
                if (n-- == 0)
                    return val;
    
            auto sum = ranges::fold_left(window, Int{}, std::plus<>{});
            ranges::copy(views::drop(window, 1), window.begin());
            window.back() = sum;
        }
    
        throw std::logic_error{"unreachable"};
    }
    
    #include <fmt/ranges.h>
    #include <fmt/ostream.h>
    
    #include <boost/multiprecision/cpp_int.hpp>
    #include <boost/multiprecision/gmp.hpp>
    #include <boost/numeric/ublas/io.hpp>
    #include <boost/numeric/ublas/matrix.hpp>
    
    template <class T> T my_pow(T b, int64_t e) {
        if (e == 1)
            return b;
        if (e % 2 == 1)
            return prod(b, my_pow(b, e - 1));
        T tmp = my_pow(b, e / 2);
        return prod(tmp, tmp);
    }
    
    // using Int = uint64_t;
    // using Int = boost::multiprecision::int128_t;
    // using Int = boost::multiprecision::int1024_t;
    // using Int = boost::multiprecision::cpp_int;
    using Int    = boost::multiprecision::mpz_int;
    using Matrix = boost::numeric::ublas::matrix<Int>;
    
    using Real = boost::multiprecision::mpf_float_50; // just for log10
    using boost::multiprecision::log10;
    
    template <> struct fmt::formatter<Int> : fmt::ostream_formatter { };
    template <> struct fmt::formatter<Real> : fmt::ostream_formatter { };
    
    Matrix onacci_matrix(size_t order) {
        Matrix m(order, order);
        m.assign(boost::numeric::ublas::zero_matrix<Int>(order, order));
    
        for (size_t i = 0; i < order; ++i)
            m(i, 0) = 1;
    
        for (size_t i = 1; i < order; ++i)
            m(i-1, i) = 1;
    
        return order % 2 ? trans(m) : m;
    }
    
    template <size_t Order> void compare(uint64_t n) {
        auto timed = [](auto f) {
            auto start = now();
            auto r     = f();
            return std::tuple((now() - start), r);
        };
    
        auto [itime, itval] = timed([n] { return onacci_iterative<Order, Int>(n); });
        auto [mtime, m]     = timed([n] { return my_pow(onacci_matrix(Order), n); });
        auto mval           = m(0, 0);
    
        fmt::print("order:{} {:<20} {:10L}μs 10log:{}n", Order, fmt::format("iterative({}):", n), itime / 1us, log10(Real(itval)));
        fmt::print("order:{} {:<20} {:10L}μs 10log:{}n", Order, fmt::format("matrix^{}:", n), mtime / 1us, log10(Real(mval)));
        fmt::print(" -- itval == mval? {}n", itval == mval);
    }
    
    int main() {
        std::locale::global(std::locale("en_US.UTF-8"));
    
        for (uint64_t n = 19; n < 19e5; n *= 10) {
            compare<3>(n);
            compare<4>(n);
            compare<5>(n);
            compare<6>(n);
        }
    }
    

    SUMMARY

    So while nothing above addresses your code, it may help you validating your intermediates, and perhaps getting ballpark measures on what performance to expect.

    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search