How to optimize this code?_问答_开发者_运维开发者技术经验分享

Profiler says that 50% of total time spends inside this function. How would you optimize it? It converts BMP color scheme to YUV. Thanks!

Update: platform is ARMV6 (writing for IPhone)

#define Y_FROM_RGB(_r_,_g_,_b_) ( (  66 * _b_ + 129 * _g_ +  25 * _r_ + 128) >> 8) + 16
#define V_FROM_RGB(_r_,_g_,_b_) ( ( 112 * _b_ -  94 * _g_ -  18 * _r_ + 128) >> 10) + 128
#define U_FROM_RGB(_r_,_g_,_b_) ( ( -38 * _b_ -  74 * _g_ + 112 * _r_ + 128) >> 10) + 128

  /*!
 * \brief
 * Converts 24 bit image to YCrCb image channels
 * 
 * \param source
 * Source 24bit image pointer
 * 
 * \param source_width
 * Source image width
 * 
 * \param dest_Y
 * destination image Y component pointer
 * 
 * \param dest_scan_size_Y
 * destination image Y component line size
 * 
 * \param dest_U
 * destination image U component pointer
 * 
 * \param dest_scan_size_U
 * destination image U component line size
 * 
 * \param dest_V
 * destination image V component pointer
 * 
 * \param dest_scan_size_V
 * destination image V component line size
 * 
 * \param dest_width
 * Destination image width = source_width
 * 
 * \param dest_height
 * Destination image height = source image height
 *
 * Convert 24 bit image (source) with width (source_width)
 * to YCrCb image channels (dest_Y, dest_U, dest_V) with size (dest_width)x(dest_height), and line size
 * (dest_scan_size_Y, dest_scan_size_U, dest_scan_size_V) (in bytes)
 * 
 */
void ImageConvert_24_YUV420P(unsigned char * source, int source_width,
                            unsigned char * dest_Y, int dest_scan_size_Y,
                            unsigned char * dest_U, int dest_scan_size_U,
                            unsigned char * dest_V, int dest_scan_size_V,
                            int dest_width, int dest_height)
{
  int source_scan_size = source_width*3;

  int half_width = dest_width/2;

  //Y loop
  for (int y = 0; y < dest_height/2; y ++)
  {
    //Start of line
    unsigned char * source_scan = source;
    unsigned char * source_scan_next = source+source_scan_size;
    unsigned char * dest_scan_Y = dest_Y;
    unsigned char * dest_scan_U = dest_U;
    unsigned char * dest_scan_V = dest_V;

    //Do all pixels
    for (int x = 0; x < half_width; x++)
    {
      int 开发者_开发百科R = source_scan[0];
      int G = source_scan[1];
      int B = source_scan[2];

      //Y
      int Y = Y_FROM_RGB(B, G, R);

      *dest_scan_Y = Y;
      source_scan += 3;
      dest_scan_Y += 1;

      int R1 = source_scan[0];
      int G1 = source_scan[1];
      int B1 = source_scan[2];

      //Y
      Y = Y_FROM_RGB(B1, G1, R1);

      R += (R1 + source_scan_next[0] + source_scan_next[3]);
      G += (G1 + source_scan_next[1] + source_scan_next[4]);
      B += (B1 + source_scan_next[2] + source_scan_next[5]);


      //YCrCb
      *dest_scan_Y = Y;
      *dest_scan_V = V_FROM_RGB(B, G, R);
      *dest_scan_U = U_FROM_RGB(B, G, R);

      source_scan += 3;
      dest_scan_Y += 1;
      dest_scan_U += 1;
      dest_scan_V += 1;
      source_scan_next += 6;
    };

    //scroll to next line
    source += source_scan_size;
    dest_Y += dest_scan_size_Y;
    dest_U += dest_scan_size_U;
    dest_V += dest_scan_size_V;

    //Start of line
    source_scan = source;
    dest_scan_Y = dest_Y;

    //Do all pixels
    for (int x = 0; x < half_width; x ++)
    {
      int R = source_scan[0];
      int G = source_scan[1];
      int B = source_scan[2];

      //Y
      int Y = Y_FROM_RGB(B, G, R);

      *dest_scan_Y = Y;
      source_scan += 3;
      dest_scan_Y += 1;

      R = source_scan[0];
      G = source_scan[1];
      B = source_scan[2];

      //Y
      Y = Y_FROM_RGB(B, G, R);
      *dest_scan_Y = Y;
      source_scan += 3;
      dest_scan_Y += 1;
    };

    source += source_scan_size;
    dest_Y += dest_scan_size_Y;
  };
};

Unless I am missing something the follow code seems to be repeated in both loops, so, why not go through this loop once? This may require some changes to your algorithm, but it would improve performance.

for (int x = 0; x < half_width; x ++) 
{ 
  int R = source_scan[0]; 
  int G = source_scan[1]; 
  int B = source_scan[2]; 

  //Y 
  int Y = Y_FROM_RGB(B, G, R); 

  *dest_scan_Y = Y; 
  source_scan += 3; 
  dest_scan_Y += 1; 

  R = source_scan[0]; 
  G = source_scan[1]; 
  B = source_scan[2];

But, before doing anything, move the two inside loops into separate functions, and then run your profiler, and see if you spend more time in one function than the other.

You have three loops in this function, and you don't know which section is actually where you are spending your time. So determine that before doing any optimization, otherwise you may find that you are fixing the wrong section.

I don't know what platform you are using but you might want to look SIMD

Arm Cotext-A8 has Neon technology that does support SIMD. You should be able to find more information on the ARM website.

Presuming that the memory they point to does not overlap, you should declare your source, dest_Y, dest_U and dest_V pointers with the restrict qualifier, to tell the compiler this and allow it to optimise better.