
Which native iOS framework is best used to eradicate this CPU hog written in OpenCV?

/// Reduce the channel elements of a given Mat to a single channel
static func reduce(input: Mat) throws -> Mat {
    
    let output = Mat(rows: input.rows(), cols: input.cols(), type: CvType.CV_8UC1)
    
    for x in 0 ..< input.rows() {
        for y in 0 ..< input.cols() {
            // Each get/put crosses the Swift-to-Objective-C bridge and copies a per-pixel array
            let value = input.get(row: x, col: y)
            let dataValue = value.reduce(0, +)
            try output.put(row: x, col: y, data: [dataValue])
        }
    }
    
    return output
}

It takes 20+ seconds to do those gets and puts on the real-world data I ran this code through.

2 Answers


  1. Chosen as BEST ANSWER

    For folks such as myself with a poor comprehension of ARM intrinsics, a simpler solution is to bridge into Objective-C code, as Soonts did, and thus ditch the crude Swift API to OpenCV, bypassing the costly memory copying of those gets and puts.

    #include <assert.h>

    // Sum the first two channels of every pixel into a single-channel byte buffer.
    // input/output are the raw Mat data pointers; step/output_step are the
    // distances in bytes between consecutive rows.
    void fasterSumX2( const char *input,
                     int rows,
                     int columns,
                     long step,
                     int channels,
                     char* output,
                     long output_step
                     )
    {
        for(int j = 0;j < rows;j++){
            for(int i = 0;i < columns;i++){
                long offset = step * j + i * channels;
                const unsigned char *ptr = (const unsigned char *)(input + offset);
                int res = ptr[0]+ptr[1];
                // The two-channel sum must still fit into a single byte
                assert(res <= 255);
                *(output + output_step * j + i) = (char)res;
            }
        }
    }
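
    To call this from Swift, one way is to declare the function in a plain C header and import that header from the project's bridging header; a minimal sketch follows (the file name FasterSum.h is made up for illustration):

    // FasterSum.h: declaration only, the definition lives in the C/Objective-C file shown above
    #ifndef FasterSum_h
    #define FasterSum_h

    // Sums the first two channels of every pixel into a single-channel byte buffer
    void fasterSumX2( const char *input,
                      int rows,
                      int columns,
                      long step,
                      int channels,
                      char *output,
                      long output_step );

    #endif /* FasterSum_h */

    With the declaration visible, Swift sees fasterSumX2 as a free function taking raw pointers, so the whole matrix is reduced in a single bridged call instead of one get and one put per pixel.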
    

  2. Assuming your input matrix is CV_64FC2, call the computeSumX2 C function below for each row.

    Untested.

    #include <arm_neon.h>
    #include <stdint.h>
    #include <stddef.h>
    
    // Load 8 FP64 values, add pairwise, narrow uint64 to uint32, combine into a single vector
    inline uint32x4_t reduce4( const double* rsi )
    {
        // Load 8 values
        float64x2x4_t f64 = vld1q_f64_x4( rsi );
        // Add them pairwise
        float64x2_t f64_1 = vpaddq_f64( f64.val[ 0 ], f64.val[ 1 ] );
        float64x2_t f64_2 = vpaddq_f64( f64.val[ 2 ], f64.val[ 3 ] );
    
        // Convert FP64 to uint64
        uint64x2_t i64_1 = vcvtq_u64_f64( f64_1 );
        uint64x2_t i64_2 = vcvtq_u64_f64( f64_2 );
    
        // Narrow uint64 to uint32 into a single vector, using saturation
        uint32x2_t low = vqmovn_u64( i64_1 );
        return vqmovn_high_u64( low, i64_2 );
    }
    
    // Compute pairwise sum of FP64 values, cast to bytes
    void computeSumX2( uint8_t* rdi, size_t length, const double* rsi )
    {
        const double* const rsiEnd = rsi + length * 2;
        size_t lengthAligned = ( length / 16 ) * 16;
        const double* const rsiEndAligned = rsi + lengthAligned * 2;
    
        for( ; rsi < rsiEndAligned; rsi += 16 * 2, rdi += 16 )
        {
            // Each iteration of the loop loads 32 source values, stores 16 bytes
            uint16x4_t low16 = vqmovn_u32( reduce4( rsi ) );
            uint16x8_t u16 = vqmovn_high_u32( low16, reduce4( rsi + 8 ) );
            uint8x8_t low8 = vqmovn_u16( u16 );
    
            low16 = vqmovn_u32( reduce4( rsi + 8 * 2 ) );
            u16 = vqmovn_high_u32( low16, reduce4( rsi + 8 * 3 ) );
            uint8x16_t res = vqmovn_high_u16( low8, u16 );
    
            vst1q_u8( rdi, res );
        }
    
        for( ; rsi < rsiEnd; rsi += 2, rdi++ )
        {
            // Each iteration of the loop loads 2 source values, stores a single byte
            float64x2_t f64 = vld1q_f64( rsi );
            double sum = vaddvq_f64( f64 );
            *rdi = (uint8_t)sum;
        }
    }
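
    The per-row driver is not shown above; here is a minimal sketch that can live in the same file as computeSumX2, assuming a CV_64FC2 matrix whose raw data pointer, row strides in bytes, and dimensions are obtained on the calling side (the name reduceRows and its parameters are illustrative only):

    // Walk the matrix row by row, honoring the row strides (pitches) in bytes,
    // because OpenCV does not guarantee that rows are stored contiguously.
    void reduceRows( uint8_t* output, size_t outputStepBytes,
                     const double* input, size_t inputStepBytes,
                     size_t rows, size_t cols )
    {
        const uint8_t* src = (const uint8_t*)input;
        for( size_t r = 0; r < rows; r++ )
        {
            // Each source row holds cols pixels of two doubles each
            computeSumX2( output + r * outputStepBytes,
                          cols,
                          (const double*)( src + r * inputStepBytes ) );
        }
    }

    The Swift side then makes one bridged call per matrix, the same pattern as in the accepted answer above.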
    