开发者

Why does arm-gcc decrement/increment the stack pointer even when the stack is never accessed?

开发者 https://www.devze.com 2023-03-04 08:15 出处:网络
When compiling this program with arm-elf-gcc-4.5 -O3 -march=armv7-a -mthumb -mfpu=neon -mfloat-abi=softfp:

When compiling this program with arm-elf-gcc-4.5 -O3 -march=armv7-a -mthumb -mfpu=neon -mfloat-abi=softfp:

#include <arm_neon.h>
extern float32x4_t cross(const float32x4_t& v1, const float32x4_t& v2) {
    float32x4x2_t
        xxyyzz1(vzipq_f32(v1, v1)),
        xxyyzz2(vzipq_f32(v2, v2));

    float32x2_t
        xx1(vget_low_f32(xxyyzz1.val[0])),
        yy1(vget_high_f32(xxyyzz1.val[0])),
        zz1(vget_low_f32(xxyyzz1.val[1])),
        xx2(vget_low_f32(xxyyzz2.val[0])),
        yy2(vget_high_f32(xxyyzz2.val[0])),
        zz2(vget_low_f32(xxyyzz2.val[1]));

    float32x2_t
        x(vmls_f32(vmul_f32(yy1, zz2), zz1, yy2)),
        y(vmls_f32(vmul_f32(zz1, xx2), xx1, zz2)),
        z(vmls_f32(vmul_f32(xx1, yy2), yy1, xx2));

    return vcombine_f32(vuzp_f32(x, y).val[0], z);
}

...this is what I get. Notice the two useless instructions marked with @<<<

_Z5crossRK19__simd128_float32_tS1_:
    vldmia  r0, {d16-d17}
    vldmia  r1, {d22-d23}
    vmov    q10, q8  @ v4sf
    vmov    q9, q11  @ v4sf
    vzip.32 q8, q10
    vzip.32 q11, q9
    vmov    d24, d17
    vmov    d21, d22
    vmov    d22, d23
    vmul.f32    d17, d24, d18
    vmul.f32    d19, d20, d21
    vmls.f32    d19, d16, d18
    vmls.f32    d17, d20, d22
    vmul.f32    d16, d16, d22
    vuzp.32 d17, d19
    vmls.f32    d16, d24, d21
    sub sp, sp, #80 @<<<
    vswp    d17, d16
    vmov    r0, r1, d16  @ v4sf
    vmov    r2, r3, d17
    add sp, sp, #80 @<<<
    bx

The stack is never accessed, yet the stack pointer get decremented, then incremented by the same amount. Why?

If I modify the original code to include asm comment at the end of the prologue and the begining of the epilogue, like this:

#include <arm_neon.h>
extern float32x4_t cross(const float32x4_t& v1, const float32x4_t& v2) {
    asm volatile("# End of prologue");
    float32x4x2_t
        xxyyzz1(vzipq_f32(v1, v1)),
        xxyyzz2(vzipq_f32(v2, v2));

    float32x2_t
        xx1(vget_low_f32(xxyyzz1.val[0])),
        yy1(vget_high_f32(xxyyzz1.val[0])),
        zz1(vget_low_f32(xxyyzz1.val[1])),
        xx2(vget_low_f32(xxyyzz2.val[0])),
        yy2(vget_high_f32(xxyyzz2.val[0])),
        zz2(vget_low_f32(xxyyzz2.val[1]));

    float32x2_t
   开发者_运维技巧     x(vmls_f32(vmul_f32(yy1, zz2), zz1, yy2)),
        y(vmls_f32(vmul_f32(zz1, xx2), xx1, zz2)),
        z(vmls_f32(vmul_f32(xx1, yy2), yy1, xx2));

    float32x4_t res(vcombine_f32(vuzp_f32(x, y).val[0], z));
    asm volatile("# Start of epilogue");
    return res;
}

Then I get slightly different version:

_Z5crossRK19__simd128_float32_tS1_:
    sub sp, sp, #80
    # End of prologue
    vldmia  r0, {d16-d17}
    vldmia  r1, {d22-d23}
    vmov    q10, q8  @ v4sf
    vmov    q9, q11  @ v4sf
    vzip.32 q8, q10
    vzip.32 q11, q9
    vmov    d24, d17
    vmov    d21, d22
    vmov    d22, d23
    vmul.f32    d17, d24, d18
    vmul.f32    d19, d20, d21
    vmls.f32    d19, d16, d18
    vmls.f32    d17, d20, d22
    vmul.f32    d16, d16, d22
    vuzp.32 d17, d19
    vmls.f32    d16, d24, d21
    vswp    d17, d16
    # Start of epilogue
    vmov    r0, r1, d16  @ v4sf
    vmov    r2, r3, d17
    add sp, sp, #80
    bx  lr

The stack pointer decrement/increment clearly is part of the prologue/epilogue, and happens even if the stack is not used. Is that to comply with some standard, or is it a gcc optimization bug?

EDIT: Compiler is arm-elf-gcc-4.5 (GCC) 4.5.0, configured with: /opt/local/var/macports/build/_opt_local_var_macports_sources_rsync.macports.org_release_ports_cross_arm-elf-gcc/work/gcc-4.5.0/configure --prefix=/opt/local --infodir=/opt/local/share/info --mandir=/opt/local/share/man --target=arm-elf --program-prefix=arm-elf- --program-suffix=-4.5 --without-included-gettext --enable-obsolete --with-newlib --disable-__cxa_atexit --enable-multilib --enable-biendian --disable-libgfortran --with-gxx-include-dir=/opt/local/arm-elf/include/c++/4.5.0/ --enable-languages=c,c++,objc --build=x86_64-apple-darwin10 --enable-fpu

EDIT: I managed to pinpoint the problem using the following C source. It only happens when using arrays of vector types as temporaries, such as float32x4x2_t which is declared as struct { float32x4_t val[2]; }, even tho these temporaries are made registers. I believe this is a bug, so I reported it.

#include <arm_neon.h>

// This one is ok
extern float32x4_t add(float32x4_t* v1, float32x4_t* v2) {
    return vaddq_f32(*v1, *v2);
#if 0
    produced assembly:

add:
    vldmia  r0, {d16-d17}
    vldmia  r1, {d18-d19}
    vadd.f32    q8, q8, q9
    vmov    r0, r1, d16
    vmov    r2, r3, d17
    bx  lr

#endif
}

// This one uses float32x4x2_t temporaries and has the bug
extern float32x4_t cross(float32x4_t* v1, float32x4_t* v2) {
    float32x4x2_t
        xxyyzz1=vzipq_f32(*v1, *v1),
        xxyyzz2=vzipq_f32(*v2, *v2);

    float32x2_t
        xx1=vget_low_f32(xxyyzz1.val[0]),
        yy1=vget_high_f32(xxyyzz1.val[0]),
        zz1=vget_low_f32(xxyyzz1.val[1]),
        xx2=vget_low_f32(xxyyzz2.val[0]),
        yy2=vget_high_f32(xxyyzz2.val[0]),
        zz2=vget_low_f32(xxyyzz2.val[1]);

    float32x2_t
        x=vmls_f32(vmul_f32(yy1, zz2), zz1, yy2),
        y=vmls_f32(vmul_f32(zz1, xx2), xx1, zz2),
        z=vmls_f32(vmul_f32(xx1, yy2), yy1, xx2);

    return vcombine_f32(vuzp_f32(x, y).val[0], z);
#if 0
    produced assembly:

cross:
    vldmia  r0, {d18-d19}
    vldmia  r1, {d16-d17}
    vmov    q10, q9
    vmov    q11, q8
    vzip.32 q9, q10
    vzip.32 q8, q11
    vmov    d24, d19
    vmov    d21, d16
    vmov    d16, d17
    vmul.f32    d19, d20, d21
    vmul.f32    d17, d24, d22
    vmls.f32    d17, d20, d16
    vmls.f32    d19, d18, d22
    vmul.f32    d16, d18, d16
    vuzp.32 d17, d19
    vmls.f32    d16, d24, d21
    sub sp, sp, #48           @ here
    vswp    d17, d16
    vmov    r0, r1, d16
    vmov    r2, r3, d17
    add sp, sp, #48           @ and here
    bx  lr

#endif
}


This turned out to be a bug, so closing it.

0

精彩评论

暂无评论...
验证码 换一张
取 消

关注公众号