I'm a CUDA beginner. I have a pixel buffer of unsigned chars in global memory that can and is updated by any and all threads. To avoid weirdness in the pixel values, therefore, I want to perform an atomicExch when a thread attempts to update one. But the 开发者_如何转开发programming guide says that this function only works on 32- or 64-bit words, whereas I just want to atomically exchange one 8-bit byte. Is there a way to do this?
Thanks.
I just ran into this problem recently. In theory, atomic operations / optimistic retries are supposed to be faster than locks/mutexes, so the "hack" solutions that use atomic operations on other data types seem better to me than using critical sections.
Here are some implementations based on the threads for how to implement atomicMin for char and atomicAdd for short.
I've tested all of these, and my tests seem to show that they work fine so far.
Version 1 of atomicAdd for char
__device__ static inline char atomicAdd(char* address, char val) {
// offset, in bytes, of the char* address within the 32-bit address of the space that overlaps it
size_t long_address_modulo = (size_t) address & 3;
// the 32-bit address that overlaps the same memory
auto* base_address = (unsigned int*) ((char*) address - long_address_modulo);
// A 0x3210 selector in __byte_perm will simply select all four bytes in the first argument in the same order.
// The "4" signifies the position where the first byte of the second argument will end up in the output.
unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210};
// for selecting bytes within a 32-bit chunk that correspond to the char* address (relative to base_address)
unsigned int selector = selectors[long_address_modulo];
unsigned int long_old, long_assumed, long_val, replacement;
long_old = *base_address;
do {
long_assumed = long_old;
// replace bits in long_old that pertain to the char address with those from val
long_val = __byte_perm(long_old, 0, long_address_modulo) + val;
replacement = __byte_perm(long_old, long_val, selector);
long_old = atomicCAS(base_address, long_assumed, replacement);
} while (long_old != long_assumed);
return __byte_perm(long_old, 0, long_address_modulo);
}
atomicCAS for char
__device__ static inline char atomicCAS(char* address, char expected, char desired) {
size_t long_address_modulo = (size_t) address & 3;
auto* base_address = (unsigned int*) ((char*) address - long_address_modulo);
unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210};
unsigned int sel = selectors[long_address_modulo];
unsigned int long_old, long_assumed, long_val, replacement;
char old;
long_val = (unsigned int) desired;
long_old = *base_address;
do {
long_assumed = long_old;
replacement = __byte_perm(long_old, long_val, sel);
long_old = atomicCAS(base_address, long_assumed, replacement);
old = (char) ((long_old >> (long_address_modulo * 8)) & 0x000000ff);
} while (expected == old && long_assumed != long_old);
return old;
}
Version 2 of atomicAdd for char (uses bit shifts instead of __byte_perm and has to handle overflow as a result)
__device__ static inline char atomicAdd2(char* address, char val) {
size_t long_address_modulo = (size_t) address & 3;
auto* base_address = (unsigned int*) ((char*) address - long_address_modulo);
unsigned int long_val = (unsigned int) val << (8 * long_address_modulo);
unsigned int long_old = atomicAdd(base_address, long_val);
if (long_address_modulo == 3) {
// the first 8 bits of long_val represent the char value,
// hence the first 8 bits of long_old represent its previous value.
return (char) (long_old >> 24);
} else {
// bits that represent the char value within long_val
unsigned int mask = 0x000000ff << (8 * long_address_modulo);
unsigned int masked_old = long_old & mask;
// isolate the bits that represent the char value within long_old, add the long_val to that,
// then re-isolate by excluding bits that represent the char value
unsigned int overflow = (masked_old + long_val) & ~mask;
if (overflow) {
atomicSub(base_address, overflow);
}
return (char) (masked_old >> 8 * long_address_modulo);
}
}
For atomicMin, please check this thread.
You might implement a critical section using a mutex variable. So something like
get_the_lock
exch_data
release
http://forums.nvidia.com/index.php?showtopic=185809
Implementing a critical section in CUDA
The other answer has a bug in its implementation of atomicCAS()
. This version works for me:
__device__
static inline
uint8_t
atomicCAS( uint8_t * const address,
uint8_t const compare,
uint8_t const value )
{
// Determine where in a byte-aligned 32-bit range our address of 8 bits occurs.
uint8_t const longAddressModulo = reinterpret_cast< size_t >( address ) & 0x3;
// Determine the base address of the byte-aligned 32-bit range that contains our address of 8 bits.
uint32_t * const baseAddress = reinterpret_cast< uint32_t * >( address - longAddressModulo );
uint32_t constexpr byteSelection[] = { 0x3214, 0x3240, 0x3410, 0x4210 }; // The byte position we work on is '4'.
uint32_t const byteSelector = byteSelection[ longAddressModulo ];
uint32_t const longCompare = compare;
uint32_t const longValue = value;
uint32_t longOldValue = * baseAddress;
uint32_t longAssumed;
uint8_t oldValue;
do
{
// Select bytes from the old value and new value to construct a 32-bit value to use.
uint32_t const replacement = __byte_perm( longOldValue, longValue, byteSelector );
uint32_t const comparison = __byte_perm( longOldValue, longCompare, byteSelector );
longAssumed = longOldValue;
// Use 32-bit atomicCAS() to try and set the 8-bits we care about.
longOldValue = ::atomicCAS( baseAddress, comparison, replacement );
// Grab the 8-bit portion we care about from the old value at address.
oldValue = ( longOldValue >> ( 8 * longAddressModulo )) & 0xFF;
}
while ( compare == oldValue and longAssumed != longOldValue ); // Repeat until other three 8-bit values stabilize.
return oldValue;
}
精彩评论