
Which native iOS framework is best used to eradicate this CPU hog written in OpenCV?

/// Reduce the channel elements of a given Mat to a single channel
static func reduce(input: Mat) throws -> Mat {
    
    let output = Mat(rows: input.rows(), cols: input.cols(), type: CvType.CV_8UC1)
    
    for x in 0 ..< input.rows() {
        for y in 0 ..< input.cols() {
            // Each get/put crosses the Swift-to-Objective-C bridge and copies a per-pixel array
            let value = input.get(row: x, col: y)
            let dataValue = value.reduce(0, +)
            try output.put(row: x, col: y, data: [dataValue])
        }
    }
    
    return output
}

It takes 20+ seconds to do those gets and puts on the real-world data I ran this code through.

2 Answers


  1. Chosen as BEST ANSWER

    For folks such as myself with a poor comprehension of ARM intrinsics, a simpler solution is to bridge into Objective-C code, as Soonts did, and thus ditch the crude Swift API to OpenCV, bypassing the costly memory copying of those gets and puts.

    #include <assert.h>

    // Sum the first two channels of every pixel into a single-channel byte buffer.
    // input/output are the raw Mat data pointers; step/output_step are the
    // distances in bytes between consecutive rows.
    void fasterSumX2( const char *input,
                     int rows,
                     int columns,
                     long step,
                     int channels,
                     char* output,
                     long output_step
                     )
    {
        for(int j = 0;j < rows;j++){
            for(int i = 0;i < columns;i++){
                long offset = step * j + i * channels;
                const unsigned char *ptr = (const unsigned char *)(input + offset);
                int res = ptr[0]+ptr[1];
                // The two-channel sum must still fit into a single byte
                assert(res <= 255);
                *(output + output_step * j + i) = (char)res;
            }
        }
    }
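
    To call this from Swift, one way is to declare the function in a plain C header and import that header from the project's bridging header; a minimal sketch follows (the file name FasterSum.h is made up for illustration):

    // FasterSum.h: declaration only, the definition lives in the C/Objective-C file shown above
    #ifndef FasterSum_h
    #define FasterSum_h

    // Sums the first two channels of every pixel into a single-channel byte buffer
    void fasterSumX2( const char *input,
                      int rows,
                      int columns,
                      long step,
                      int channels,
                      char *output,
                      long output_step );

    #endif /* FasterSum_h */

    With the declaration visible, Swift sees fasterSumX2 as a free function taking raw pointers, so the whole matrix is reduced in a single bridged call instead of one get and one put per pixel.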
    

  2. Assuming your input matrix is CV_64FC2, call the computeSumX2 C function below for each row.

    Untested.

    #include <arm_neon.h>
    #include <stdint.h>
    #include <stddef.h>
    
    // Load 8 FP64 values, add pairwise, narrow uint64 to uint32, combine into a single vector
    inline uint32x4_t reduce4( const double* rsi )
    {
        // Load 8 values
        float64x2x4_t f64 = vld1q_f64_x4( rsi );
        // Add them pairwise
        float64x2_t f64_1 = vpaddq_f64( f64.val[ 0 ], f64.val[ 1 ] );
        float64x2_t f64_2 = vpaddq_f64( f64.val[ 2 ], f64.val[ 3 ] );
    
        // Convert FP64 to uint64
        uint64x2_t i64_1 = vcvtq_u64_f64( f64_1 );
        uint64x2_t i64_2 = vcvtq_u64_f64( f64_2 );
    
        // Narrow uint64 to uint32 into a single vector, using saturation
        uint32x2_t low = vqmovn_u64( i64_1 );
        return vqmovn_high_u64( low, i64_2 );
    }
    
    // Compute pairwise sum of FP64 values, cast to bytes
    void computeSumX2( uint8_t* rdi, size_t length, const double* rsi )
    {
        const double* const rsiEnd = rsi + length * 2;
        size_t lengthAligned = ( length / 16 ) * 16;
        const double* const rsiEndAligned = rsi + lengthAligned * 2;
    
        for( ; rsi < rsiEndAligned; rsi += 16 * 2, rdi += 16 )
        {
            // Each iteration of the loop loads 32 source values, stores 16 bytes
            uint16x4_t low16 = vqmovn_u32( reduce4( rsi ) );
            uint16x8_t u16 = vqmovn_high_u32( low16, reduce4( rsi + 8 ) );
            uint8x8_t low8 = vqmovn_u16( u16 );
    
            low16 = vqmovn_u32( reduce4( rsi + 8 * 2 ) );
            u16 = vqmovn_high_u32( low16, reduce4( rsi + 8 * 3 ) );
            uint8x16_t res = vqmovn_high_u16( low8, u16 );
    
            vst1q_u8( rdi, res );
        }
    
        for( ; rsi < rsiEnd; rsi += 2, rdi++ )
        {
            // Each iteration of the loop loads 2 source values, stores a single byte
            float64x2_t f64 = vld1q_f64( rsi );
            double sum = vaddvq_f64( f64 );
            *rdi = (uint8_t)sum;
        }
    }
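
    The per-row driver is not shown above; here is a minimal sketch that can live in the same file as computeSumX2, assuming a CV_64FC2 matrix whose raw data pointer, row strides in bytes, and dimensions are obtained on the calling side (the name reduceRows and its parameters are illustrative only):

    // Walk the matrix row by row, honoring the row strides (pitches) in bytes,
    // because OpenCV does not guarantee that rows are stored contiguously.
    void reduceRows( uint8_t* output, size_t outputStepBytes,
                     const double* input, size_t inputStepBytes,
                     size_t rows, size_t cols )
    {
        const uint8_t* src = (const uint8_t*)input;
        for( size_t r = 0; r < rows; r++ )
        {
            // Each source row holds cols pixels of two doubles each
            computeSumX2( output + r * outputStepBytes,
                          cols,
                          (const double*)( src + r * inputStepBytes ) );
        }
    }

    The Swift side then makes one bridged call per matrix, the same pattern as in the accepted answer above.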
    