Expression templates are not being inlined fully

I have completed the first version of a math library, and as the next step I'd like to turn to expression templates to improve the performance of the code. However, my initial results are different from what I expected. I am compiling with MSVC 2010, in vanilla Release mode (and am okay with C++0x).

Apologies in advance for the large amount of code I'll be showing you; it's as minimal as I can make it while still letting people see what I'm doing. Profiling framework:

#include <algorithm>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <iterator>
#include <limits>
#include <type_traits>
#include <vector>

namespace math
{
    class vector; // to be determined

    // Writes the four components of `vec` to `stream`, each one followed by
    // a single space, and returns the stream so insertions can be chained.
    std::ostream& operator<<(std::ostream& stream, const vector& vec)
    {
        std::size_t component = 0;
        while (component != 4)
        {
            stream << vec[component] << " ";
            ++component;
        }

        return stream;
    }
}

// test framework
// array_type: fixed array of three operand sequences (test() reads
// indices 0, 1 and 2).
typedef std::vector<math::vector> array_type[3];
// vector_type: one growable sequence of math::vector values.
typedef std::vector<math::vector> vector_type;

// Returns the next value of the std::rand() sequence converted to float.
// The sequence (and therefore the result) is controlled by std::srand().
float generate_float()
{
    // <cstdlib> only guarantees the declaration in namespace std, so the
    // call is qualified instead of relying on a global ::rand.
    return static_cast<float>(std::rand());
}

// Builds a math::vector whose four components are four successive
// generate_float() results, assigned in x, y, z, w order.
math::vector generate_vector()
{
    // Each component is drawn in its own statement: the evaluation order of
    // function arguments is unspecified, so drawing them inside the
    // constructor call would let the mapping from the random sequence to
    // x/y/z/w differ between compilers.
    const float x = generate_float();
    const float y = generate_float();
    const float z = generate_float();
    const float w = generate_float();

    return math::vector(x, y, z, w);
}

// Produces a sequence of `count` vectors, each obtained from
// generate_vector().
vector_type generate_source(std::size_t count)
{
    vector_type generated;
    generated.reserve(count); // capacity requested once, before filling

    for (std::size_t remaining = count; remaining != 0; --remaining)
        generated.push_back(generate_vector());

    return generated;
}

double test(const array_type& source,
            vector_type& results, std::size_t iterations)
{
    // time
    std::clock_t begin = std::clock();
    for (std::size_t i = 0; i < iterations; ++i)
    {
        const math::vector& v0 = source[0][i];
        const math::vector& v1 = source[1][i];
        const math::vector& v2 = source[2][i];

        math::vector result(v0 + v1 + v2);
        results.push_back(result);
    }
    std::clock_t end = std::clock();

    // print time
    double elapsed = static_cast<double>(end - begin) / CLOCKS_PER_SEC;
    std::cout << "time: " << elapsed << "\n";

    return elapsed;
}

// Benchmark driver. Builds three source sequences of random vectors, runs
// test() a few times as a warm-up (results discarded), then runs it
// time_count times for real. Finally prints the total and average of the
// samples with one minimum and one maximum sample thrown out, plus the
// observed minimum and maximum.
int main()
{
    // prepare tests
    const std::size_t time_count = 50; // number of times to get time count
    const std::size_t test_count = 10000000; // number of iterations in a test

    std::cout << "allocating..." << std::endl;
    std::vector<double> timeResults; timeResults.reserve(time_count);

    array_type source;
    for (std::size_t i = 0; i < 3; ++i)
        source[i] = generate_source(test_count);

    vector_type results;
    results.reserve(test_count);

    // pre tests (warm-up; their timings are discarded below)
    std::cout << "pre-testing..." << std::endl;
    for (std::size_t i = 0; i < time_count / 10; ++i)
    {
        timeResults.push_back(test(source, results, test_count));
        results.clear();
    }
    timeResults.clear();

    // begin tests
    std::cout << "testing..." << std::endl;
    for (std::size_t i = 0; i < time_count; ++i)
    {
        timeResults.push_back(test(source, results, test_count));
        results.clear();
    }

    // can be turned into functors for non-C++0x, for testing in C++03
    double min = std::numeric_limits<double>::max();
    // For floating-point types numeric_limits<>::min() is the smallest
    // *positive* value, not the most negative one. Seeding `max` with it
    // reports a wrong maximum (and never discards a maximum sample) when
    // every timing is 0, so start from the most negative finite double.
    double max = -std::numeric_limits<double>::max();
    std::for_each(timeResults.begin(), timeResults.end(),
                    [&min, &max](double x)
                    {
                        min = std::min(x, min);
                        max = std::max(x, max);
                    });

    double sum = 0; // throws out max and min results
    bool minFlag = false, maxFlag = false;
    std::for_each(timeResults.begin(), timeResults.end(),
                    [min, max, &sum, &minFlag, &maxFlag](double x)
                    {
                        // the first sample equal to the minimum and the
                        // first one equal to the maximum are skipped once
                        if (!minFlag && x <= min)
                            minFlag = true; // skip
                        else if (!maxFlag && x >= max)
                            maxFlag = true; // skip
                        else
                            sum += x; // add
                    });

    // print results
    // two samples were discarded above, hence size() - 2
    double average = sum / (timeResults.size() - 2);
    std::cout << "\ntotal time: " << sum << " average time: " << average
                << "\n" << "min: " << min << " max: " << max << std::endl;
}

Expression template vector:

namespace math
{
    // core expression template
    //
    // Base class of every expression node. A node type E derives from
    // vector_expression<E>; get<I>() downcasts to E and forwards to E's own
    // get<I>(), so component access is resolved at compile time.
    template <typename E>
    class vector_expression
    {
    public:
        // Returns component I of the derived expression E.
        template <std::size_t I>
        float get() const
        {
            // `template` is required here: get is a member template of the
            // dependent type E, and without the keyword `<` is parsed as
            // less-than by conforming compilers.
            return static_cast<const E&>(*this).template get<I>();
        }

    protected:
        // not a public base
        // NOTE: the discussion accompanying this code reports that MSVC 2010
        // stops inlining operator+ while this user-provided destructor is
        // present.
        ~vector_expression() {}
    };

    // vector class
    //
    // Four-float value type; also the leaf node of the expression tree.
    class vector : public vector_expression<vector>
    {
    public:
        // All components zero.
        vector()
        {
            data[0] = data[1] = data[2] = data[3] = 0;
        }

        // Component-wise construction.
        vector(float x, float y, float z, float w)
        {
            data[0] = x; data[1] = y; data[2] = z; data[3] = w;
        }

        // Evaluates an arbitrary expression into this vector, one component
        // at a time (components 0 through 3).
        template <typename E>
        vector(const vector_expression<E>& e)
        {
            evaluate<0>(e);
        }

        // Compile-time indexed component access.
        template <std::size_t I>
        float get() const
        {
            return data[I];
        }

        // Run-time indexed component access; `index` is not range-checked.
        float operator[](std::size_t index) const
        {
            return data[index];
        }

    private:
        // Recursive step: stores component I of `e`, then continues with
        // component I + 1. Selected while I < 4.
        template <std::size_t I, typename E>
        void evaluate(const vector_expression<E>& e,
                        typename std::enable_if<(I < 4)>::type* = nullptr)
        {
            // `template` is needed because e's type depends on E
            data[I] = e.template get<I>();

            evaluate<I + 1>(e);
        }

        // Terminating step: selected once I reaches 4; nothing left to do.
        template <std::size_t I, typename E>
        void evaluate(const vector_expression<E>&,
                        typename std::enable_if<(I >= 4)>::type* = nullptr)
        {
            // done
        }

        float data[4];
    };

    // Expression node representing `first + second`. It only holds
    // references to its operands, so both operands must outlive the node;
    // the addition itself happens when get<I>() is called.
    template <typename E1, typename E2>
    class vector_expression_sum :
        public vector_expression<vector_expression_sum<E1, E2>>
    {
    public:
        vector_expression_sum(const vector_expression<E1>& first,
                                const vector_expression<E2>& second) :
        mFirst(first),
        mSecond(second)
        {}

        // Returns component I of the sum of both operands.
        template <std::size_t I>
        float get() const
        {
            // `template` is needed because the operand types depend on
            // E1 / E2
            return mFirst.template get<I>() + mSecond.template get<I>();
        }

    private:
        const vector_expression<E1>& mFirst;
        const vector_expression<E2>& mSecond;
    };

    // Builds a sum node for two expressions; no arithmetic is done here.
    template <typename E1, typename E2>
    vector_expression_sum<E1, E2>
        operator+(const vector_expression<E1>& first,
                    const vector_expression<E2>& second)
    {
        return vector_expression_sum<E1, E2>(first, second);
    }
}

Manually inlined:

namespace math
{
    // same definition
    // (placeholder: presumably the vector_expression / vector code from the
    // expression-template listing is reused unchanged here; the
    // get<I>() calls in the manually inlined test() need at least
    // math::vector from it)
}

// ...

// Manually inlined variant of test(). Only the changed statement is shown;
// every "// ..." marks code elided from this listing (presumably identical
// to the first version of test()).
double test(const array_type& source,
            vector_type& results, std::size_t iterations)
{
    // ...

    {
        // ...

        // The sum is spelled out per component through get<I>() and handed
        // to the four-float constructor, so neither operator+ nor any
        // expression object is involved. v0, v1 and v2 are declared in the
        // elided part above.
        math::vector result(v0.get<0>() + v1.get<0>() + v2.get<0>(),
                            v0.get<1>() + v1.get<1>() + v2.get<1>(),
                            v0.get<2>() + v1.get<2>() + v2.get<2>(),
                            v0.get<3>() + v1.get<3>() + v2.get<3>());

        // ...
    }

    // ...
}

// ...

Results:

Expression templates:
total time: 14.172 average time: 0.29525
min: 0.281 max: 0.422

Manually inlined:
total time: 8.438 average time: 0.175792
min: 0.171 max: 0.188

As you can see, the expression templates (apparently) aren't being turned into fully inlined code. Here's the disassembly of test(), up to the last call to std::clock():

Expression templates assembly:

test:
00401110  push        ebp
00401111  mov         ebp,esp
00401113  sub         esp,38h
00401116  mov         eax,dword ptr [___security_cookie (404018h)]
0040111B  xor         eax,ebp
0040111D  mov         dword ptr [ebp-4],eax
00401120  push        ebx
00401121  push        esi
00401122  mov         esi,ecx
00401124  mov         dword ptr [ebp-28h],esi
00401127  call        dword ptr [__imp__clock (4030DCh)]
0040112D  xor         ebx,ebx
0040112F  mov         dword ptr [ebp-1Ch],eax
00401132  mov         dword ptr [ebp-24h],ebx
00401135  jmp         test+2Ah (40113Ah)
00401137  mov         esi,dword ptr [ebp-28h]
0040113A  mov         eax,dword ptr [esi+20h]
0040113D  mov         edx,dword ptr [esi+10h]
00401140  mov         ecx,dword ptr [esi]
00401142  add         eax,ebx
00401144  mov         dword ptr [ebp-18h],eax
00401147  add         edx,ebx
00401149  add         ecx,ebx
0040114B  lea         eax,[ebp-30h]
0040114E  call        math::operator+<math::vector,math::vector> (401E60h)
00401153  mov         edx,dword ptr [ebp-18h]
00401156  mov         ecx,eax
00401158  lea         eax,[ebp-38h]
0040115B  call        math::operator+<math::vector,math::vector> (401E60h)
00401160  mov         ecx,dword ptr [eax]
00401162  mov         edx,dword ptr [ecx+4]
00401165  fld         dword ptr [edx]
00401167  mov         edx,dword ptr [ecx]
00401169  fadd        dword ptr [edx]
0040116B  mov         eax,dword ptr [eax+4]
0040116E  mov         edx,dword ptr [ecx+4]
00401171  fstp        dword ptr [ebp-18h]
00401174  fld         dword ptr [ebp-18h]
00401177  fadd        dword ptr [eax]
00401179  fstp        dword ptr [ebp-14h]
0040117C  fld         dword ptr [edx+4]
0040117F  mov         edx,dword ptr [ecx]
00401181  fadd        dword ptr [edx+4]
00401184  mov         edx,dword ptr [ecx+4]
00401187  fstp        dword ptr [ebp-18h]
0040118A  fld         dword ptr [ebp-18h]
0040118D  fadd        dword ptr [eax+4]
00401190  fstp        dword ptr [ebp-10h]
00401193  fld         dword ptr [edx+8]
00401196  mov         edx,dword ptr [ecx]
00401198  fadd        dword ptr [edx+8]
0040119B  mov         edx,dword ptr [ecx+4]
0040119E  mov         ecx,dword ptr [ecx]
004011A0  fstp        dword ptr [ebp-18h]
004011A3  fld         dword ptr [ebp-18h]
004011A6  fadd        dword ptr [eax+8]
004011A9  fstp        dword ptr [ebp-0Ch]
004011AC  fld         dword ptr [edx+0Ch]
004011AF  lea         edx,[ebp-14h]
004011B2  fadd        dword ptr [ecx+0Ch]
004011B5  fstp        dword ptr [ebp-18h]
004011B8  fld         dword ptr [ebp-18h]
004011BB  fadd        dword ptr [eax+0Ch]
004011BE  mov         eax,dword ptr [edi+4]
004011C1  fstp        dword ptr [ebp-8]
004011C4  cmp         edx,eax
004011C6  jae         test+12Ch (40123Ch)
004011C8  mov         edx,dword ptr [edi]
004011CA  lea         ecx,[ebp-14h]
004011CD  cmp         edx,ecx
004011CF  ja          test+12Ch (40123Ch)
004011D1  mov         esi,ecx
004011D3  mov         ecx,dword ptr [edi+8]
004011D6  sub         esi,edx
004011D8  cmp         eax,ecx
004011DA  jne         test+10Bh (40121Bh)
004011DC  sub         eax,edx
004011DE  sar         eax,4
004011E1  cmp         eax,0FFFFFFEh
004011E6  ja          test+201h (401311h)
004011EC  sub         ecx,edx
004011EE  inc         eax
004011EF  sar         ecx,4
004011F2  cmp         eax,ecx
004011F4  jbe         test+10Bh (40121Bh)
004011F6  mov         edx,ecx
004011F8  shr         edx,1
004011FA  mov         ebx,0FFFFFFFh
004011FF  sub         ebx,edx
00401201  cmp         ebx,ecx
00401203  jae         test+0F9h (401209h)
00401205  xor         ecx,ecx
00401207  jmp         test+0FBh (40120Bh)
00401209  add         ecx,edx
0040120B  cmp         ecx,eax
0040120D  jae         test+101h (401211h)
0040120F  mov         ecx,eax
00401211  mov         edx,edi
00401213  call        std::vector<math::vector,std::allocator<math::vector> >::reserve (401930h)
00401218  mov         ebx,dword ptr [ebp-24h]
0040121B  mov         eax,dword ptr [edi+4]
0040121E  and         esi,0FFFFFFF0h
00401221  add         esi,dword ptr [edi]
00401223  test        eax,eax
00401225  je          test+18Fh (40129Fh)
00401227  mov         edx,dword ptr [esi]
00401229  mov         dword ptr [eax],edx
0040122B  mov         ecx,dword ptr [esi+4]
0040122E  mov         dword ptr [eax+4],ecx
00401231  mov         edx,dword ptr [esi+8]
00401234  mov         dword ptr [eax+8],edx
00401237  mov         ecx,dword ptr [esi+0Ch]
0040123A  jmp         test+18Ch (40129Ch)
0040123C  mov         ecx,dword ptr [edi+8]
0040123F  cmp         eax,ecx
00401241  jne         test+171h (401281h)
00401243  mov         edx,dword ptr [edi]
00401245  sub         eax,edx
00401247  sar         eax,4
0040124A  cmp         eax,0FFFFFFEh
0040124F  ja          test+201h (401311h)
00401255  sub         ecx,edx
00401257  inc         eax
00401258  sar         ecx,4
0040125B  cmp         eax,ecx
0040125D  jbe         test+171h (401281h)
0040125F  mov         edx,ecx
00401261  shr         edx,1
00401263  mov         esi,0FFFFFFFh
00401268  sub         esi,edx
0040126A  cmp         esi,ecx
0040126C  jae         test+162h (401272h)
0040126E  xor         ecx,ecx
00401270  jmp         test+164h (401274h)
00401272  add         ecx,edx
00401274  cmp         ecx,eax
00401276  jae         test+16Ah (40127Ah)
00401278  mov         ecx,eax
0040127A  mov         edx,edi
0040127C  call        std::vector<math::vector,std::allocator<math::vector> >::reserve (401930h)
00401281  mov         eax,dword ptr [edi+4]
00401284  test        eax,eax
00401286  je          test+18Fh (40129Fh)
00401288  mov         edx,dword ptr [ebp-14h]
0040128B  mov         ecx,dword ptr [ebp-10h]
0040128E  mov         dword ptr [eax],edx
00401290  mov         edx,dword ptr [ebp-0Ch]
00401293  mov         dword ptr [eax+4],ecx
00401296  mov         ecx,dword ptr [ebp-8]
00401299  mov         dword ptr [eax+8],edx
0040129C  mov         dword ptr [eax+0Ch],ecx
0040129F  add         dword ptr [edi+4],10h
004012A3  add         ebx,10h
004012A6  mov         dword ptr [ebp-24h],ebx
004012A9  cmp         ebx,9896800h
004012AF  jb          test+27h (401137h)
004012B5  call        dword ptr [__imp__clock (4030DCh)]

Manual inline assembly:

test:
004010B0  push        ebp
004010B1  mov         ebp,esp
004010B3  sub         esp,28h
004010B6  mov         eax,dword ptr [___security_cookie (404018h)]
004010BB  xor         eax,ebp
004010BD  mov         dword ptr [ebp-4],eax
004010C0  push        ebx
004010C1  push        esi
004010C2  mov         esi,ecx
004010C4  mov         dword ptr [ebp-24h],esi
004010C7  call        dword ptr [__imp__clock (4030DCh)]
004010CD  xor         ebx,ebx
004010CF  mov         dword ptr [ebp-1Ch],eax
004010D2  mov         dword ptr [ebp-18h],ebx
004010D5  mov         eax,dword ptr [esi]
004010D7  mov         ecx,dword ptr [esi+10h]
004010DA  fld         dword ptr [eax+ebx]
004010DD  fadd        dword ptr [ecx+ebx]
004010E0  mov         edx,dword ptr [esi+20h]
004010E3  add         eax,ebx
004010E5  add         ecx,ebx
004010E7  fadd        dword ptr [edx+ebx]
004010EA  add         edx,ebx
004010EC  fstp        dword ptr [ebp-14h]
004010EF  fld         dword ptr [ecx+4]
004010F2  fadd        dword ptr [eax+4]
004010F5  fadd        dword ptr [edx+4]
004010F8  fstp        dword ptr [ebp-10h]
004010FB  fld         dword ptr [ecx+8]
004010FE  fadd        dword ptr [eax+8]
00401101  fadd        dword ptr [edx+8]
00401104  fstp        dword ptr [ebp-0Ch]
00401107  fld         dword ptr [ecx+0Ch]
0040110A  lea         ecx,[ebp-14h]
0040110D  fadd        dword ptr [eax+0Ch]
00401110  mov         eax,dword ptr [edi+4]
00401113  fadd        dword ptr [edx+0Ch]
00401116  fstp        dword ptr [ebp-8]
00401119  cmp         ecx,eax
0040111B  jae         test+0E4h (401194h)
0040111D  mov         edx,dword ptr [edi]
0040111F  cmp         edx,ecx
00401121  ja          test+0E4h (401194h)
00401123  mov         esi,ecx
00401125  mov         ecx,dword ptr [edi+8]
00401128  sub         esi,edx
0040112A  cmp         eax,ecx
0040112C  jne         test+0BDh (40116Dh)
0040112E  sub         eax,edx
00401130  sar         eax,4
00401133  cmp         eax,0FFFFFFEh
00401138  ja          test+1BCh (40126Ch)
0040113E  sub         ecx,edx
00401140  inc         eax
00401141  sar         ecx,4
00401144  cmp         eax,ecx
00401146  jbe         test+0BDh (40116Dh)
00401148  mov         edx,ecx
0040114A  shr         edx,1
0040114C  mov         ebx,0FFFFFFFh
00401151  sub         ebx,edx
00401153  cmp         ebx,ecx
00401155  jae         test+0ABh (40115Bh)
00401157  xor         ecx,ecx
00401159  jmp         test+0ADh (40115Dh)
0040115B  add         ecx,edx
0040115D  cmp         ecx,eax
0040115F  jae         test+0B3h (401163h)
00401161  mov         ecx,eax
00401163  mov         edx,edi
00401165  call        std::vector<math::vector,std::allocator<math::vector> >::reserve (401890h)
0040116A  mov         ebx,dword ptr [ebp-18h]
0040116D  mov         eax,dword ptr [edi+4]
00401170  and         esi,0FFFFFFF0h
00401173  add         esi,dword ptr [edi]
00401175  test        eax,eax
00401177  je          test+0DFh (40118Fh)
00401179  mov         edx,dword ptr [esi]
0040117B  mov         dword ptr [eax],edx
0040117D  mov         ecx,dword ptr [esi+4]
00401180  mov         dword ptr [eax+4],ecx
00401183  mov         edx,dword ptr [esi+8]
00401186  mov         dword ptr [eax+8],edx
00401189  mov         ecx,dword ptr [esi+0Ch]
0040118C  mov         dword ptr [eax+0Ch],ecx
0040118F  mov         esi,dword ptr [ebp-24h]
00401192  jmp         test+14Ah (4011FAh)
00401194  mov         ecx,dword ptr [edi+8]
00401197  cmp         eax,ecx
00401199  jne         test+12Ch (4011DCh)
0040119B  mov         edx,dword ptr [edi]
0040119D  sub         eax,edx
0040119F  sar         eax,4
004011A2  cmp         eax,0FFFFFFEh
004011A7  ja          test+1BCh (40126Ch)
004011AD  sub         ecx,edx
004011AF  inc         eax
004011B0  sar         ecx,4
004011B3  cmp         eax,ecx
004011B5  jbe         test+12Ch (4011DCh)
004011B7  mov         edx,ecx
004011B9  shr         edx,1
004011BB  mov         esi,0FFFFFFFh
004011C0  sub         esi,edx
004011C2  cmp         esi,ecx
004011C4  jae         test+11Ah (4011CAh)
004011C6  xor         ecx,ecx
004011C8  jmp         test+11Ch (4011CCh)
004011CA  add         ecx,edx
004011CC  cmp         ecx,eax
004011CE  jae         test+122h (4011D2h)
004011D0  mov         ecx,eax
004011D2  mov         edx,edi
004011D4  call        std::vector<math::vector,std::allocator<math::vector> >::reserve (401890h)
004011D9  mov         esi,dword ptr [ebp-24h]
004011DC  mov         eax,dword ptr [edi+4]
004011DF  test        eax,eax
004011E1  je          test+14Ah (4011FAh)
004011E3  mov         edx,dword ptr [ebp-14h]
004011E6  mov         ecx,dword ptr [ebp-10h]
004011E9  mov         dword ptr [eax],edx
004011EB  mov         edx,dword ptr [ebp-0Ch]
004011EE  mov         dword ptr [eax+4],ecx
004011F1  mov         ecx,dword ptr [ebp-8]
004011F4  mov         dword ptr [eax+8],edx
004011F7  mov         dword ptr [eax+0Ch],ecx
004011FA  add         dword ptr [edi+4],10h
004011FE  add         ebx,10h
00401201  mov         dword ptr [ebp-18h],ebx
00401204  cmp         ebx,9896800h
0040120A  jb          test+25h (4010D5h)
00401210  call        dword ptr [__imp__clock (4030DCh)]

Conclusion: For whatever reason, MSVC2010 does not inline calls to operator+. Does anybody know why this is? Even putting __forceinline (which I'd like to avoid) doesn't inline.

Update: As jdv-Jan de Vaan mentioned, when I remove the destructor:

// ~vector_expression() {} // not a public base

it inlines operator+. The strange thing is that it inlines it to different assembly, and my tests indicate this output, while performing better than my original, still doesn't reach the same status as the manually inlined version. Any ideas why that is?

00A710B0  push        ebp  
00A710B1  mov         ebp,esp  
00A710B3  sub         esp,28h  
00A710B6  mov         eax,dword ptr [___security_cookie (0A74018h)]  
00A710BB  xor         eax,ebp  
00A710BD  mov         dword ptr [ebp-4],eax  
00A710C0  push        ebx  
00A710C1  push        esi  
00A710C2  mov         esi,ecx  
00A710C4  mov         dword ptr [ebp-24h],esi  
00A710C7  call        dword ptr [__imp__clock (0A730DCh)]  
00A710CD  xor         ebx,ebx  
00A710CF  mov         dword ptr [ebp-1Ch],eax  
00A710D2  mov         dword ptr [ebp-28h],ebx  
00A710D5  mov         eax,dword ptr [esi]  
00A710D7  mov         ecx,dword ptr [esi+10h]  
00A710DA  fld         dword ptr [eax+ebx]  
00A710DD  fadd        dword ptr [ecx+ebx]  
00A710E0  mov         edx,dword ptr [esi+20h]  
00A710E3  add         eax,ebx  
00A710E5  add         ecx,ebx  
00A710E7  fstp        dword ptr [ebp-18h]  
00A710EA  add         edx,ebx  
00A710EC  fld         dword ptr [ebp-18h]  
00A710EF  fadd        dword ptr [edx]  
00A710F1  fstp        dword ptr [ebp-14h]  
00A710F4  fld         dword ptr [eax+4]  
00A710F7  fadd        dword ptr [ecx+4]  
00A710FA  fstp        dword ptr [ebp-18h]  
00A710FD  fld         dword ptr [ebp-18h]  
00A71100  fadd        dword ptr [edx+4]  
00A71103  fstp        dword ptr [ebp-10h]  
00A71106  fld         dword ptr [eax+8]  
00A71109  fadd        dword ptr [ecx+8]  
00A7110C  fstp        dword ptr [ebp-18h]  
00A7110F  fld         dword ptr [ebp-18h]  
00A71112  fadd        dword ptr [edx+8]  
00A71115  fstp        dword ptr [ebp-0Ch]  
00A71118  fld         dword ptr [eax+0Ch]  
00A7111B  mov         eax,dword ptr [edi+4]  
00A7111E  fadd        dword ptr [ecx+0Ch]  
00A71121  lea         ecx,[ebp-14h]  
00A71124  fstp        dword ptr [ebp-18h]  
00A71127  fld         dword ptr [ebp-18h]  
00A7112A  fadd        dword ptr [edx+0Ch]  
00A7112D  fstp        dword ptr [ebp-8]  
00A71130  cmp         ecx,eax  
00A71132  jae         test+0FBh (0A711ABh)  
00A71134  mov         edx,dword ptr [edi]  
00A71136  cmp         edx,ecx  
00A71138  ja          test+0FBh (0A711ABh)  
00A7113A  mov         esi,ecx  
00A7113C  mov         ecx,dword ptr [edi+8]  
00A7113F  sub         esi,edx  
00A71141  cmp         eax,ecx  
00A71143  jne         test+0D4h (0A71184h)  
00A71145  sub         eax,edx  
00A71147  sar         eax,4  
00A7114A  cmp         eax,0FFFFFFEh  
00A7114F  ja          test+1D3h (0A71283h)  
00A71155  sub         ecx,edx  
00A71157  inc         eax  
00A71158  sar         ecx,4  
00A7115B  cmp         eax,ecx  
00A7115D  jbe         test+0D4h (0A71184h)  
00A7115F  mov         edx,ecx  
00A71161  shr         edx,1  
00A71163  mov         ebx,0FFFFFFFh  
00A71168  sub         ebx,edx  
00A7116A  cmp         ebx,ecx  
00A7116C  jae         test+0C2h (0A71172h)  
00A7116E  xor         ecx,ecx  
00A71170  jmp         test+0C4h (0A71174h)  
00A71172  add         ecx,edx  
00A71174  cmp         ecx,eax  
00A71176  jae         test+0CAh (0A7117Ah)  
00A71178  mov         ecx,eax  
00A7117A  mov         edx,edi  
00A7117C  call        std::vector<math::vector,std::allocator<math::vector> >::reserve (0A718A0h)  
00A71181  mov         ebx,dword ptr [ebp-28h]  
00A71184  mov         eax,dword ptr [edi+4]  
00A71187  and         esi,0FFFFFFF0h  
00A7118A  add         esi,dword ptr [edi]  
00A7118C  test        eax,eax  
00A7118E  je          test+0F6h (0A711A6h)  
00A71190  mov         edx,dword ptr [esi]  
00A71192  mov         dword ptr [eax],edx  
00A71194  mov         ecx,dword ptr [esi+4]  
00A71197  mov         dword ptr [eax+4],ecx  
00A7119A  mov         edx,dword ptr [esi+8]  
00A7119D  mov         dword ptr [eax+8],edx  
00A711A0  mov         ecx,dword ptr [esi+0Ch]  
00A711A3  mov         dword ptr [eax+0Ch],ecx  
00A711A6  mov         esi,dword ptr [ebp-24h]  
00A711A9  jmp         test+161h (0A71211h)  
00A711AB  mov         ecx,dword ptr [edi+8]  
00A711AE  cmp         eax,ecx  
00A711B0  jne         test+143h (0A711F3h)  
00A711B2  mov         edx,dword ptr [edi]  
00A711B4  sub         eax,edx  
00A711B6  sar         eax,4  
00A711B9  cmp         eax,0FFFFFFEh  
00A711BE  ja          test+1D3h (0A71283h)  
00A711C4  sub         ecx,edx  
00A711C6  inc         eax  
00A711C7  sar         ecx,4  
00A711CA  cmp         eax,ecx  
00A711CC  jbe         test+143h (0A711F3h)  
00A711CE  mov         edx,ecx  
00A711D0  shr         edx,1  
00A711D2  mov         esi,0FFFFFFFh  
00A711D7  sub         esi,edx  
00A711D9  cmp         esi,ecx  
00A711DB  jae         test+131h (0A711E1h)  
00A711DD  xor         ecx,ecx  
00A711DF  jmp         test+133h (0A711E3h)  
00A711E1  add         ecx,edx  
00A711E3  cmp         ecx,eax  
00A711E5  jae         test+139h (0A711E9h)  
00A711E7  mov         ecx,eax  
00A711E9  mov         edx,edi  
00A711EB  call        std::vector<math::vector,std::allocator<math::vector> >::reserve (0A718A0h)  
00A711F0  mov         esi,dword ptr [ebp-24h]  
00A711F3  mov         eax,dword ptr [edi+4]  
00A711F6  test        eax,eax  
00A711F8  je          test+161h (0A71211h)  
00A711FA  mov         edx,dword ptr [ebp-14h]  
00A711FD  mov         ecx,dword ptr [ebp-10h]  
00A71200  mov         dword ptr [eax],edx  
00A71202  mov         edx,dword ptr [ebp-0Ch]  
00A71205  mov         dword ptr [eax+4],ecx  
00A71208  mov         ecx,dword ptr [ebp-8]  
00A7120B  mov         dword ptr [eax+8],edx  
00A7120E  mov         dword ptr [eax+0Ch],ecx  
00A71211  add         dword ptr [edi+4],10h  
00A71215  add         ebx,10h  
00A71218  mov         dword ptr [ebp-28h],ebx  
00A7121B  cmp         ebx,9896800h  
00A71221  jb          test+25h (0A710D5h)  
00A71227  call        dword ptr [__imp__clock (0A730DCh)]

I already commented on this question earlier. I was concerned about the presence of an empty user-defined destructor, which could disable inlining. After some googling around, I feel more confident that this might actually be the answer.

This answer describes a situation that is eerily close to what you describe in your question. There, a user-defined destructor prevents inlining of an operator+ even if __forceinline is set. There are also useful debugging tips to be found there.

There is also a bug report on Microsoft Connect. I first heard about it in a discussion of the SafeInt library on Channel 9.