I was recently tasked with improving some embedded code where a disproportionate amount of time was being spent in malloc
.
The client wants an easy fix that could be effected with some very simple search and replace commands (i.e., no massive, complex changes to the source code) while delivering a substantial benefit.
Upon analysis, 开发者_如何学编程it looks like the vast majority (~80%) of allocations were for 128 bytes or less.
To that end, I put together some proof of concept code using a quickpool, the idea being that one allocation from malloc
is done at start up (enough to hold, for example, 1024 blocks of 128 bytes each) and allocations wehich could be satisfied from this quickpool would be. I was hoping that removing the more complex calculations in malloc
(more complex since it has to handle varying size blocks) would result in a speed increase.
Then, all source code is changed to use the quickpool instead of regular malloc
. In the event that the quickpool was exhausted or the allocation required more than the block size, the request would be passed through to malloc
.
The results weren't bad, giving me a (roughly) 50% reduction in time but I'm wondering if I can do better.
The quickpool functions are shown below. First the header file:
#ifndef QUICKPOOL_H_INCLUDED
#define QUICKPOOL_H_INCLUDED
#include <stdlib.h>
typedef struct qp_pool_s qp_pool;
qp_pool *qp_create (size_t, size_t);
qp_pool *qp_destroy (qp_pool *);
void *qp_malloc (qp_pool *, size_t);
void *qp_free (qp_pool *, void *);
#endif
The create
and destroy
functions allow for different pools to be created with different block sizes and block counts:
#include <string.h>
#include "quickpool.h"
// Various global values.
#define DEFLT_BLKS 1024
#define DEFLT_SZ 128
struct qp_pool_s {
char *data; // Actual blocks.
char *enddata; // First byte beyond data.
char *isused; // Used map for blocks.
size_t total_blks; // Number of blocks in pool.
size_t free_blks; // Free blocks in pool.
size_t blk_sz; // Size of each block.
size_t low_free; // Lowest free block.
};
qp_pool *qp_create (size_t quant, size_t sz) {
// Zero means a default size.
if (quant == 0) quant = DEFLT_BLKS;
if (sz == 0) sz = DEFLT_SZ;
// Allocate memory blocks for pool.
qp_pool *pool = malloc (sizeof (*pool));
if (pool == NULL) return NULL;
if ((pool->data = malloc (quant * sz)) == NULL) {
free (pool);
return NULL;
}
if ((pool->isused = malloc (quant)) == NULL) {
free (pool->data);
free (pool);
return NULL;
}
// Set information on pool and return it.
pool->enddata = &(pool->data[quant * sz]);
memset (pool->isused, 0, quant);
pool->total_blks = quant;
pool->free_blks = quant;
pool->blk_sz = sz;
pool->low_free = 0;
return pool;
}
qp_pool *qp_destroy (qp_pool *pool) {
// Just free all the memory for pool.
if (pool != NULL) {
free (pool->data);
free (pool->isused);
free (pool);
}
return NULL;
}
And then there are the malloc
and free
counterparts:
void *qp_malloc (qp_pool *pool, size_t sz) {
int index;
// If no pool, need more than BUFSZ bytes or pool empty, use default.
if (pool == NULL) return malloc (sz);
if ((sz > pool->blk_sz) || (pool->free_blks == 0))
return malloc (sz);
// Otherwise, get from quickpool. First we find a free block.
for (index = pool->low_free; pool->isused[index]; index++)
;
// Then we mark it used.
pool->isused[index] = 1;
pool->free_blks--;
// Set lowest possible free block for speeding up next search.
pool->low_free = index + 1;
return &(pool->data[index * pool->blk_sz]);
}
void *qp_free (qp_pool *pool, void *ptr) {
int index;
// No pool created, use default.
if (pool == NULL) {
free (ptr);
return NULL;
}
// Not in quick pool, use default.
if (((char*)ptr < pool->data) || ((char*)ptr >= pool->enddata)) {
free (ptr);
return NULL;
}
// This is a quickpool address, free it.
index = ((char*)ptr - pool->data) / pool->blk_sz;
pool->isused[index] = 0;
pool->free_blks++;
// Optimise next search.
if (index < pool->low_free)
pool->low_free = index;
return NULL;
}
For completeness, the main program for testing is below:
#include <stdio.h>
#include <string.h>
#include <time.h>
#include "quickpool.h"
#define FREE 0
#define ALLOC 1
#define NUMPTRS 512
static void *pointer[NUMPTRS];
static size_t numPointers = 0;
int main (int argCount, char *argVal[]) {
int count, val, index, memsz, stMode, seed;
qp_pool *quickPool;
seed = atoi (argVal[1]);
stMode = (strcmp (argVal[2], "standard") == 0);
srand (seed);
int baseline = clock();
quickPool = qp_create (0, 0);
for (count = 0; count < 1000000; count++) {
if (numPointers == 0)
val = ALLOC;
else if (numPointers == NUMPTRS)
val = FREE;
else if (numPointers > NUMPTRS/2)
val = ((rand() % 100) < 50) ? FREE : ALLOC;
else
val = ((rand() % 100) < 33) ? FREE : ALLOC;
if (val == FREE) {
index = rand() % numPointers;
if (stMode)
free (pointer[index]);
else
qp_free (quickPool, pointer[index]);
pointer[index] = pointer[--numPointers];
} else {
memsz = rand() % 160;
if (stMode)
pointer[numPointers++] = malloc (memsz);
else
pointer[numPointers++] = qp_malloc (quickPool, memsz);
}
}
quickPool = qp_destroy (quickPool);
baseline = clock() - baseline;
printf ("%d\n", baseline * 1000 / CLOCKS_PER_SEC);
return 0;
}
along with a shell script for analysis:
#!/usr/bin/bash
normal=0
quick=0
printf " %10s %10s\n" Normal Quick
printf " ========== ==========\n"
for iter1 in 0 1 ; do
for iter2 in 0 1 2 3 4 5 6 7 8 9 ; do
seed=$RANDOM
val=$(./qptest.exe $seed standard)
printf "${iter1}${iter2} %10d " $val
((normal = normal + val))
val=$(./qptest.exe $seed quickpool)
printf "%10d\n" $val
((quick = quick + val))
done
done
printf " ========== ==========\n"
((pct = quick * 100 / normal))
printf "sum %10d %10d (%d%%)\n" $normal $quick $pct
which outputs:
Normal Quick
========== ==========
00 469 219
01 453 219
02 453 235
03 453 219
04 453 235
05 453 219
06 469 219
07 453 234
08 453 219
09 453 219
10 453 219
11 453 235
12 453 235
13 453 219
14 453 219
15 453 235
16 453 235
17 453 235
18 469 219
19 469 234
========== ==========
sum 9124 4522 (49%)
Now my question is as follows: is there any other scope for optimising the quickpool code that doesn't rely on bypassing the requirements:
- an easy-to-integrate fix, requiring only simple global-search-and-replace on the souce code.
- the ability to have different sized pools (# of blocks) with different sized blocks.
- dropping through to
malloc
if the quick pool cannot satisfy the request.
I have obtained a good speedup over your version (approx 25% on my hardware) while maintaining the existing interfaces by using a free list instead of a free map.
As a bonus, the code even gets simpler:
#include <string.h>
#include "quickpool.h"
// Various global values.
#define DEFLT_BLKS 1024
#define DEFLT_SZ 128
struct qp_pool_s {
char *data; // Actual blocks.
char *enddata; // First byte beyond data.
size_t blk_sz; // Size of each block.
void *next_free; // Next free block
};
qp_pool *qp_create (size_t quant, size_t sz)
{
char *blk;
// Zero means a default size. sizeof(void *) is minimum block size.
if (quant == 0) quant = DEFLT_BLKS;
if (sz == 0)
sz = DEFLT_SZ;
/* Round up size to a multiple of sizeof(void *) */
sz = (sz + sizeof(void *) - 1) & ~(sizeof(void *) - 1);
// Allocate memory blocks for pool.
qp_pool *pool = malloc (sizeof (*pool));
if (pool == NULL) return NULL;
if ((pool->data = malloc (quant * sz)) == NULL) {
free (pool);
return NULL;
}
/* Set up free chain */
for (blk = pool->data; blk < &pool->data[(quant - 1) * sz]; blk += sz)
*(void **)blk = blk + sz;
*(void **)blk = NULL;
pool->next_free = pool->data;
// Set information on pool and return it.
pool->enddata = &(pool->data[quant * sz]);
pool->blk_sz = sz;
return pool;
}
qp_pool *qp_destroy (qp_pool *pool) {
// Just free all the memory for pool.
if (pool != NULL) {
free (pool->data);
free (pool);
}
return NULL;
}
void *qp_malloc (qp_pool *pool, size_t sz) {
void *blk;
// If no pool, need more than BUFSZ bytes or pool empty, use default.
if (pool == NULL) return malloc (sz);
if ((sz > pool->blk_sz) || (pool->next_free == NULL))
return malloc (sz);
// Otherwise, get from quickpool. First we find a free block.
blk = pool->next_free;
// Then we mark it used.
pool->next_free = *(void **)blk;
return blk;
}
void *qp_free (qp_pool *pool, void *ptr) {
// No pool created, use default.
if (pool == NULL) {
free (ptr);
return NULL;
}
// Not in quick pool, use default.
if (((char*)ptr < pool->data) || ((char*)ptr >= pool->enddata)) {
free (ptr);
return NULL;
}
// This is a quickpool address, free it.
*(void **)ptr = pool->next_free;
pool->next_free = ptr;
return NULL;
}
Results (my system malloc
is obviously quite fast):
Normal Quick Caf
========== ========== ==========
00 210 140 100
01 130 140 100
02 130 130 100
03 130 140 100
04 130 130 100
05 130 130 90
06 130 140 100
07 130 130 100
08 130 140 100
09 130 140 100
10 120 140 100
11 120 140 100
12 130 130 100
13 130 140 100
14 120 130 110
15 130 140 100
16 130 130 100
17 130 140 100
18 130 130 100
19 130 140 100
========== ========== ==========
sum 2650 2720 2000
( 102%) ( 75%)
If you can afford to spend a little bit of memory (say, a pointer per block) then you can keep the free blocks in a LIFO stack and eliminate the searches in qp_malloc() and qp_free(). It does make the code slightly more complicated but assures an O(1) time for all allocations.
You could store the used/unused block flag as single bits in an array of bytes - assuming a 32bit system you could store the 1024 bits in 32 ints.
Then finding a spare block is just a matter of walking over 32 ints looking for one that isn't 0xffffffff
At this point I think you ought to be using a code profiling tool, like gprof, to figure out what parts of your new code are really costing you the most time. It might also be worth running a profile on the whole program to figure out where to spend your time optimizing.
精彩评论