Hi, I'm trying to add NEON optimizations to OpenCV's LK optical flow; see the link below.

https://github.com/Itseez/opencv/blob/2.4/modules/video/src/lkpyramid.cpp

The gcc version could vary since this is an open-source project, but the one I'm currently using is 4.8.1. The target architecture is ARMv7 with NEON, and the processor I'm testing on is an ARM Cortex-A15 (big.LITTLE).

The problem is that in release mode (where optimizations are enabled) the code does not work properly, while in debug mode it works fine. I tracked one failure down to a specific variable (FLT_SCALE) that was being optimized out; after I made it volatile, that part worked. However, I'm still getting incorrect behavior from some other optimization. I'm new to inline assembly, so I suspect I'm doing something wrong and not telling the compiler that I'm using a certain variable.

Below is the code in its current state. Please ignore all the comments and the volatiles scattered everywhere (they're there for testing this problem); it's a work in progress, and I removed unnecessary functions and code so it would be easier to read. I think the problem is in the bottom-most asm block, because if I skip it with if(false), the problem goes away. Thanks.
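Before the full listing: here's a minimal, standalone example of the extended-asm operand/clobber syntax as I understand it, with named inputs and an explicit clobber list so the optimizer knows which registers and which memory the block touches. The function and the toy computation are just illustrative, not part of the patch:

    #include <stdint.h>

    // Multiply 8 shorts by 3. Everything the block reads is passed as an input
    // operand, and every register it writes -- plus "memory" for the store -- is
    // declared as a clobber, so gcc won't cache values in q0/q1 across the block
    // or reorder the surrounding loads/stores past it.
    static void mul3_by_asm(int16_t* dst, const int16_t* src)
    {
        __asm__ volatile (
            "vld1.16 {q0}, [%[in]]\n\t"
            "vmov.i16 q1, #3\n\t"
            "vmul.i16 q0, q0, q1\n\t"
            "vst1.16 {q0}, [%[out]]\n\t"
            :                                  // no register outputs
            : [in] "r" (src), [out] "r" (dst)  // inputs
            : "q0", "q1", "memory"             // clobbers
        );
    }

My asm blocks below leave the clobber lists empty, so if the release-mode code around them keeps values in q4-q15 or assumes memory is untouched, that could be related. Now the code: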
#include "precomp.hpp"
#include <float.h>
#include <stdio.h>
#include "lkpyramid.hpp"

namespace
{
static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
{
    using namespace cv;
    using cv::detail::deriv_type;

    int rows = src.rows, cols = src.cols, cn = src.channels(), colsn = cols*cn, depth = src.depth();
    CV_Assert(depth == CV_8U);
    dst.create(rows, cols, CV_MAKETYPE(DataType<deriv_type>::depth, cn*2));

#ifdef HAVE_TEGRA_OPTIMIZATION
    if (tegra::calcSharrDeriv(src, dst))
        return;
#endif

    int x, y, delta = (int)alignSize((cols + 2)*cn, 16);
    AutoBuffer<deriv_type> _tempBuf(delta*2 + 64);
    deriv_type *trow0 = alignPtr(_tempBuf + cn, 16), *trow1 = alignPtr(trow0 + delta, 16);

    int three = 3, ten = 10;

    for( y = 0; y < rows; y++ )
    {
        const uchar* srow0 = src.ptr<uchar>(y > 0 ? y-1 : rows > 1 ? 1 : 0);
        const uchar* srow1 = src.ptr<uchar>(y);
        const uchar* srow2 = src.ptr<uchar>(y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
        deriv_type* drow = dst.ptr<deriv_type>(y);

        // do vertical convolution
        x = 0;
#ifdef CV_NEON
        //assumes deriv_type is 16 bits
        if(sizeof(deriv_type) == 2 && colsn >= 16)
        {
            __asm__ volatile (
                "vdup.16 q8, %0\n\t"
                "vdup.8 d18, %1\n\t"
                :
                : "r" (three), "r" (ten)
                :
            );

            for( ; x <= colsn - 8; x += 8)
            {
                __asm__ volatile (
                    "vld1.8 {d0}, [%0]\n\t"
                    "vld1.8 {d1}, [%1]\n\t"
                    "vld1.8 {d2}, [%2]\n\t"
                    "vaddl.u8 q4, d0, d2\n\t"
                    "vsubl.u8 q11, d2, d0\n\t"
                    "vmul.u16 q5, q4, q8\n\t"
                    "vmull.u8 q6, d1, d18\n\t"
                    "vadd.u16 q10, q6, q5\n\t"
                    "vst1.16 {q10}, [%3]\n\t"
                    "vst1.16 {q11}, [%4]\n\t"
                    :
                    : "r" (srow0 + x), "r" (srow1 + x), "r" (srow2 + x), "r" (trow0 + x), "r" (trow1 + x)
                    :
                );
            }
        }
#endif
        for( ; x < colsn; x++ )
        {
            int t0 = (srow0[x] + srow2[x])*3 + srow1[x]*10;
            int t1 = srow2[x] - srow0[x];
            trow0[x] = (deriv_type)t0;
            trow1[x] = (deriv_type)t1;
        }

        // make border
        int x0 = (cols > 1 ? 1 : 0)*cn, x1 = (cols > 1 ? cols-2 : 0)*cn;
        for( int k = 0; k < cn; k++ )
        {
            trow0[-cn + k] = trow0[x0 + k]; trow0[colsn + k] = trow0[x1 + k];
            trow1[-cn + k] = trow1[x0 + k]; trow1[colsn + k] = trow1[x1 + k];
        }

#ifdef CV_NEON
        __asm__ volatile (
            "vdup.16 q8, %0\n\t"
            "vdup.16 q9, %1\n\t"
            :
            : "r" (three), "r" (ten)
            :
        );
#endif

        // do horizontal convolution, interleave the results and store them to dst
        x = 0;
#ifdef CV_NEON
        //assumes size of deriv_type is 16 bits
        if(sizeof(deriv_type) == 2 && colsn >= 16)
        {
            for( ; x <= colsn - 8; x += 8 )
            {
                __asm__ volatile (
                    "vld1.16 {q0}, [%0]\n\t"    //trow0[x + cn]
                    "vld1.16 {q1}, [%1]\n\t"    //trow0[x - cn]
                    "vsub.i16 q5, q0, q1\n\t"   //this is t0
                    "vld1.16 {q2}, [%2]\n\t"    //trow1[x + cn]
                    "vld1.16 {q3}, [%3]\n\t"    //trow1[x - cn]
                    "vadd.i16 q6, q2, q3\n\t"   //this needs mult by 3
                    "vld1.16 {q4}, [%4]\n\t"    //trow1[x]
                    "vmul.i16 q7, q6, q8\n\t"   //this needs to add to trow1[x]*10
                    "vmul.i16 q10, q4, q9\n\t"  //this is trow1[x]*10
                    "vadd.i16 q11, q7, q10\n\t" //this is t1
                    "vswp d22, d11\n\t"
                    "vst2.16 {q5}, [%5]\n\t"    //interleave
                    "vst2.16 {q11}, [%6]\n\t"   //interleave
                    :
                    : "r" (trow0 + x + cn),  //0
                      "r" (trow0 + x - cn),  //1
                      "r" (trow1 + x + cn),  //2
                      "r" (trow1 + x - cn),  //3
                      "r" (trow1 + x),       //4
                      "r" (drow + (x*2)),    //5
                      "r" (drow + (x*2)+8)   //6
                    :
                );
            }
        }
#endif
        for( ; x < colsn; x++ )
        {
            deriv_type t0 = (deriv_type)(trow0[x+cn] - trow0[x-cn]);
            deriv_type t1 = (deriv_type)((trow1[x+cn] + trow1[x-cn])*3 + trow1[x]*10);
            drow[x*2] = t0; drow[x*2+1] = t1;
        }
    }
}

}//namespace
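As a cross-check of the vertical-convolution block above, this is roughly the same per-iteration computation written with NEON intrinsics instead of raw asm; with intrinsics the compiler allocates and tracks the registers itself, so nothing can be silently clobbered. The function and parameter names are mine:

    #include <arm_neon.h>
    #include <stdint.h>

    // One 8-pixel step of the vertical pass: t0 = (row0 + row2)*3 + row1*10,
    // t1 = row2 - row0, both stored as 16-bit.
    static void vertical_conv8(const uint8_t* srow0, const uint8_t* srow1,
                               const uint8_t* srow2, int16_t* trow0, int16_t* trow1)
    {
        uint8x8_t  s0 = vld1_u8(srow0), s1 = vld1_u8(srow1), s2 = vld1_u8(srow2);
        uint16x8_t sum02 = vaddl_u8(s0, s2);                      // srow0 + srow2
        uint16x8_t t0 = vmlaq_n_u16(vmull_u8(s1, vdup_n_u8(10)),  // srow1*10
                                    sum02, 3);                    // + (srow0+srow2)*3
        int16x8_t  t1 = vreinterpretq_s16_u16(vsubl_u8(s2, s0));  // srow2 - srow0
        vst1q_s16(trow0, vreinterpretq_s16_u16(t0));
        vst1q_s16(trow1, t1);
    }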
cv::detail::LKTrackerInvoker::LKTrackerInvoker(
                      const Mat& _prevImg, const Mat& _prevDeriv, const Mat& _nextImg,
                      const Point2f* _prevPts, Point2f* _nextPts,
                      uchar* _status, float* _err,
                      Size _winSize, TermCriteria _criteria,
                      int _level, int _maxLevel, int _flags, float _minEigThreshold )
{
    prevImg = &_prevImg;
    prevDeriv = &_prevDeriv;
    nextImg = &_nextImg;
    prevPts = _prevPts;
    nextPts = _nextPts;
    status = _status;
    err = _err;
    winSize = _winSize;
    criteria = _criteria;
    level = _level;
    maxLevel = _maxLevel;
    flags = _flags;
    minEigThreshold = _minEigThreshold;
}
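One note before the tracker body below: shifter1 and shifter2 are negative because vqrshl with a negative shift amount performs a rounding shift right, which is the same operation as CV_DESCALE (plus saturation). A standalone sketch of the equivalence, with a helper name of my own:

    #include <arm_neon.h>

    // CV_DESCALE(x, n) is ((x + (1 << (n-1))) >> n); vqrshl by -n rounds and
    // shifts right the same way, lane by lane, with saturation on top.
    static inline int32x4_t descale_s32(int32x4_t v, int n)
    {
        return vqrshlq_s32(v, vdupq_n_s32(-n));  // e.g. n = W_BITS = 14
    }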
void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
{
    Point2f halfWin((winSize.width-1)*0.5f, (winSize.height-1)*0.5f);
    const Mat& I = *prevImg;
    const Mat& J = *nextImg;
    const Mat& derivI = *prevDeriv;

    int j, cn = I.channels(), cn2 = cn*2;
    cv::AutoBuffer<deriv_type> _buf(winSize.area()*(cn + cn2));
    int derivDepth = DataType<deriv_type>::depth;

    Mat IWinBuf(winSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf);
    Mat derivIWinBuf(winSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf + winSize.area()*cn);

    for( int ptidx = range.start; ptidx < range.end; ptidx++ )
    {
        Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level));
        Point2f nextPt;
        if( level == maxLevel )
        {
            if( flags & OPTFLOW_USE_INITIAL_FLOW )
                nextPt = nextPts[ptidx]*(float)(1./(1 << level));
            else
                nextPt = prevPt;
        }
        else
            nextPt = nextPts[ptidx]*2.f;
        nextPts[ptidx] = nextPt;

        Point2i iprevPt, inextPt;
        prevPt -= halfWin;
        iprevPt.x = cvFloor(prevPt.x);
        iprevPt.y = cvFloor(prevPt.y);

        if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols ||
            iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows )
        {
            if( level == 0 )
            {
                if( status )
                    status[ptidx] = false;
                if( err )
                    err[ptidx] = 0;
            }
            continue;
        }

        volatile float a = prevPt.x - iprevPt.x;
        volatile float b = prevPt.y - iprevPt.y;
        volatile const int W_BITS = 14, W_BITS1 = 14;
        volatile const float FLT_SCALE = 1.f/(1 << 20); //volatile is needed because compiler will optimize this out for NEON
        volatile int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS));
        volatile int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS));
        volatile int iw10 = cvRound((1.f - a)*b*(1 << W_BITS));
        volatile int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;

        volatile int dstep = (int)(derivI.step/derivI.elemSize1());
        volatile int stepI = (int)(I.step/I.elemSize1());
        volatile int stepJ = (int)(J.step/J.elemSize1());
        volatile float A11 = 0, A12 = 0, A22 = 0;

#ifdef CV_NEON
        volatile int CV_DECL_ALIGNED(16) nA11[] = {0, 0, 0, 0}, nA12[] = {0, 0, 0, 0}, nA22[] = {0, 0, 0, 0};
        volatile const int shifter1 = -(W_BITS - 5); //negative so it shifts right
        volatile const int shifter2 = -(W_BITS);

        if(sizeof(deriv_type) == 2)
        {
            __asm__ volatile (
                "vdup.16 d26, %0\n\t"
                "vdup.16 d27, %1\n\t"
                "vdup.16 d28, %2\n\t"
                "vdup.16 d29, %3\n\t"
                "vdup.32 q11, %4\n\t"
                "vdup.32 q12, %5\n\t"
                :
                : "r" ((short)iw00), "r" ((short)iw01), "r" ((short)iw10), "r" ((short)iw11), "r" (shifter1), "r" (shifter2)
                :
            );
        }
#endif

        // extract the patch from the first image, compute covariation matrix of derivatives
        volatile int x, y;
        for( y = 0; y < winSize.height; y++ )
        {
            volatile const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*stepI + iprevPt.x*cn;
            volatile const deriv_type* dsrc = (const deriv_type*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2;

            volatile deriv_type* Iptr = (deriv_type*)(IWinBuf.data + y*IWinBuf.step);
            volatile deriv_type* dIptr = (deriv_type*)(derivIWinBuf.data + y*derivIWinBuf.step);

            x = 0;

#ifdef CV_NEON
            if(sizeof(deriv_type) == 2 && winSize.width*cn >= 12)
            {
                for( ; x <= winSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 )
                {
                    __asm__ volatile (
                        "vld1.8 {d0}, [%0]\n\t"      //ignores last 4 bytes
                        "vmovl.u8 q0, d0\n\t"        //expand to 16-bit
                        "vld1.8 {d2}, [%1]\n\t"
                        "vmovl.u8 q1, d2\n\t"
                        "vmull.s16 q5, d0, d26\n\t"
                        "vmull.s16 q6, d2, d27\n\t"
                        "vld1.8 {d4}, [%2]\n\t"
                        "vmovl.u8 q2, d4\n\t"        //expand
                        "vld1.8 {d6}, [%3]\n\t"
                        "vmovl.u8 q3, d6\n\t"
                        "vmull.s16 q7, d4, d28\n\t"
                        "vmull.s16 q8, d6, d29\n\t"
                        "vadd.i32 q5, q5, q6\n\t"
                        "vadd.i32 q7, q7, q8\n\t"
                        "vadd.i32 q5, q5, q7\n\t"
                        "vld2.16 {d0, d1}, [%4]\n\t" //evens in d0 and d2
                        "vld2.16 {d2, d3}, [%5]\n\t"
                        "vqrshl.s32 q5, q5, q11\n\t"
                        "vmull.s16 q4, d0, d26\n\t"  //q4 is mult of even 1
                        "vmull.s16 q6, d1, d26\n\t"  //q6 is mult of odd 1
                        "vmovn.s32 d0, q5\n\t"
                        "vmull.s16 q7, d2, d27\n\t"  //q7 is mult of even 2
                        "vmull.s16 q8, d3, d27\n\t"  //q8 is mult of odd 2
                        "vst1.16 {d0}, [%8]\n\t"
                        "vld2.16 {d4, d5}, [%6]\n\t" //evens in d4 and d6
                        "vld2.16 {d6, d7}, [%7]\n\t"
                        "vadd.i32 q4, q4, q7\n\t"    //this frees up q7 and q8
                        "vadd.i32 q6, q6, q8\n\t"    //q4 is added even 1 and 2
                                                     //q6 is added odd 1 and 2
                        "vmull.s16 q7, d4, d28\n\t"  //q7 is mult of even 3
                        "vmull.s16 q0, d5, d28\n\t"  //q0 is mult of odd 3
                        "vmull.s16 q8, d6, d29\n\t"  //q8 is mult of even 4
                        "vmull.s16 q15, d7, d29\n\t" //q15 is mult of odd 4
                        "vadd.i32 q7, q7, q8\n\t"    //q7 is added even 3 and 4
                        "vadd.i32 q0, q0, q15\n\t"   //q0 is added odd 3 and 4
                        "vadd.i32 q4, q4, q7\n\t"    //q4 is added even 1,2,3,4 -- will be ixval
                        "vadd.i32 q6, q6, q0\n\t"    //q6 is added odd 1,2,3,4 -- will be iyval
                        "vld1.32 {q1}, [%11]\n\t"
                        "vld1.32 {q2}, [%12]\n\t"
                        "vld1.32 {q0}, [%10]\n\t"    //get the loads prepared
                        "vqrshl.s32 q4, q4, q12\n\t" //q4 is descaled evens added
                        "vqrshl.s32 q6, q6, q12\n\t" //q6 is descaled odds added
                        //now ixval is stored in q4 and iyval is stored in q6 and ival is in q5
                        "vmul.s32 q7, q4, q4\n\t"
                        "vmul.s32 q8, q4, q6\n\t"
                        "vmul.s32 q15, q6, q6\n\t"
                        "vadd.i32 q0, q0, q7\n\t"
                        "vadd.i32 q1, q1, q8\n\t"
                        "vadd.i32 q2, q2, q15\n\t"
                        "vst1.32 {q0}, [%10]\n\t"
                        "vst1.32 {q1}, [%11]\n\t"
                        "vst1.32 {q2}, [%12]\n\t"
                        "vmovn.i32 d8, q4\n\t"       //bring ixval to short
                        "vmovn.i32 d12, q6\n\t"      //bring iyval to short
                        "vswp d9, d12\n\t"           //now d8 is ixval and d9 is iyval
                        "vst2.16 {d8, d9}, [%9]\n\t"
                        :
                        : "r" (src + x),              //0
                          "r" (src + x + cn),         //1
                          "r" (src + x + stepI),      //2
                          "r" (src + x + stepI + cn), //3
                          "r" (dsrc),                 //4
                          "r" (dsrc + cn2),           //5
                          "r" (dsrc + dstep),         //6
                          "r" (dsrc + dstep + cn2),   //7
                          "r" (Iptr + x),             //8
                          "r" (dIptr),                //9
                          "r" (nA11),                 //10
                          "r" (nA12),                 //11
                          "r" (nA22)                  //12
                        :
                    );
                }
            }
#endif
            for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 )
            {
                int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 +
                                      src[x+stepI]*iw10 + src[x+stepI+cn]*iw11, W_BITS1-5);
                int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 +
                                       dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1);
                int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 +
                                       dsrc[dstep+cn2+1]*iw11, W_BITS1);

                Iptr[x] = (short)ival;
                dIptr[0] = (short)ixval;
                dIptr[1] = (short)iyval;

                A11 += (float)(ixval*ixval);
                A12 += (float)(ixval*iyval);
                A22 += (float)(iyval*iyval);
            }
        }

#ifdef CV_NEON
        A11 += (float)(nA11[0] + nA11[1] + nA11[2] + nA11[3]);
        A12 += (float)(nA12[0] + nA12[1] + nA12[2] + nA12[3]);
        A22 += (float)(nA22[0] + nA22[1] + nA22[2] + nA22[3]);
#endif

        A11 *= FLT_SCALE;
        A12 *= FLT_SCALE;
        A22 *= FLT_SCALE;

        volatile float D = A11*A22 - A12*A12;
        float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) +
                        4.f*A12*A12))/(2*winSize.width*winSize.height);

        if( err && (flags & CV_LKFLOW_GET_MIN_EIGENVALS) != 0 )
            err[ptidx] = (float)minEig;

        if( minEig < minEigThreshold || D < FLT_EPSILON )
        {
            if( level == 0 && status )
                status[ptidx] = false;
            continue;
        }

        D = 1.f/D;

        nextPt -= halfWin;
        Point2f prevDelta;

        for( j = 0; j < criteria.maxCount; j++ )
        {
            inextPt.x = cvFloor(nextPt.x);
            inextPt.y = cvFloor(nextPt.y);

            if( inextPt.x < -winSize.width || inextPt.x >= J.cols ||
                inextPt.y < -winSize.height || inextPt.y >= J.rows )
            {
                if( level == 0 && status )
                    status[ptidx] = false;
                break;
            }

            a = nextPt.x - inextPt.x;
            b = nextPt.y - inextPt.y;
            iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS));
            iw01 = cvRound(a*(1.f - b)*(1 << W_BITS));
            iw10 = cvRound((1.f - a)*b*(1 << W_BITS));
            iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
            float b1 = 0, b2 = 0;

            for( y = 0; y < winSize.height; y++ )
            {
                const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*stepJ + inextPt.x*cn;
                const deriv_type* Iptr = (const deriv_type*)(IWinBuf.data + y*IWinBuf.step);
                const deriv_type* dIptr = (const deriv_type*)(derivIWinBuf.data + y*derivIWinBuf.step);

                x = 0;

                for( ; x < winSize.width*cn; x++, dIptr += 2 )
                {
                    int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
                                          Jptr[x+stepJ]*iw10 + Jptr[x+stepJ+cn]*iw11,
                                          W_BITS1-5) - Iptr[x];
                    b1 += (float)(diff*dIptr[0]);
                    b2 += (float)(diff*dIptr[1]);
                }
            }

            b1 *= FLT_SCALE;
            b2 *= FLT_SCALE;

            Point2f delta( (float)((A12*b2 - A22*b1) * D),
                           (float)((A12*b1 - A11*b2) * D));
            //delta = -delta;

            nextPt += delta;
            nextPts[ptidx] = nextPt + halfWin;

            if( delta.ddot(delta) <= criteria.epsilon )
                break;

            if( j > 0 && std::abs(delta.x + prevDelta.x) < 0.01 &&
                std::abs(delta.y + prevDelta.y) < 0.01 )
            {
                nextPts[ptidx] -= delta*0.5f;
                break;
            }
            prevDelta = delta;
        }

        if( status[ptidx] && err && level == 0 && (flags & CV_LKFLOW_GET_MIN_EIGENVALS) == 0 )
        {
            Point2f nextPoint = nextPts[ptidx] - halfWin;
            Point inextPoint;

            inextPoint.x = cvFloor(nextPoint.x);
            inextPoint.y = cvFloor(nextPoint.y);

            if( inextPoint.x < -winSize.width || inextPoint.x >= J.cols ||
                inextPoint.y < -winSize.height || inextPoint.y >= J.rows )
            {
                if( status )
                    status[ptidx] = false;
                continue;
            }

            float aa = nextPoint.x - inextPoint.x;
            float bb = nextPoint.y - inextPoint.y;
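            // recompute the Q14 bilinear weights at the refined sub-pixel
            // position, then accumulate the absolute difference between the
            // interpolated patch in J and the stored patch in I; the 1/32
            // below appears to undo the 1 << 5 scale left by the W_BITS1-5
            // descale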
            iw00 = cvRound((1.f - aa)*(1.f - bb)*(1 << W_BITS));
            iw01 = cvRound(aa*(1.f - bb)*(1 << W_BITS));
            iw10 = cvRound((1.f - aa)*bb*(1 << W_BITS));
            iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
            float errval = 0.f;

            for( y = 0; y < winSize.height; y++ )
            {
                const uchar* Jptr = (const uchar*)J.data + (y + inextPoint.y)*stepJ + inextPoint.x*cn;
                const deriv_type* Iptr = (const deriv_type*)(IWinBuf.data + y*IWinBuf.step);

                for( x = 0; x < winSize.width*cn; x++ )
                {
                    int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
                                          Jptr[x+stepJ]*iw10 + Jptr[x+stepJ+cn]*iw11,
                                          W_BITS1-5) - Iptr[x];
                    errval += std::abs((float)diff);
                }
            }
            err[ptidx] = errval * 1.f/(32*winSize.width*cn*winSize.height);
        }
    }
}
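For completeness, this is my understanding of the fixed-point bilinear weights used in all three interpolation loops, as a standalone sanity check (not part of the patch; cvRound is approximated with lrintf to keep it self-contained):

    #include <assert.h>
    #include <math.h>

    // The three rounded Q14 weights plus the remainder iw11 sum to exactly
    // 1 << W_BITS by construction, so interpolating four equal pixels and
    // descaling gives back the original value.
    static void check_weights(float a, float b)
    {
        const int W_BITS = 14;
        int iw00 = (int)lrintf((1.f - a)*(1.f - b)*(1 << W_BITS));
        int iw01 = (int)lrintf(a*(1.f - b)*(1 << W_BITS));
        int iw10 = (int)lrintf((1.f - a)*b*(1 << W_BITS));
        int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;

        assert(iw00 + iw01 + iw10 + iw11 == (1 << W_BITS));

        int v = 200;  // any pixel value
        int sum = v*iw00 + v*iw01 + v*iw10 + v*iw11;
        assert(((sum + (1 << (W_BITS - 1))) >> W_BITS) == v);  // CV_DESCALE identity
    }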