#ifndef BOOST_DETAIL_ATOMIC_GCC_PPC_HPP
#define BOOST_DETAIL_ATOMIC_GCC_PPC_HPP

// Copyright (c) 2009 Helge Bahmann
//
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)

#include <boost/atomic/detail/base.hpp>
#include <boost/atomic/detail/builder.hpp>

/*
    Refer to: Motorola, "Programming Environments Manual for 32-Bit
    Implementations of the PowerPC Architecture", Appendix E:
    "Synchronization Programming Examples", for an explanation of what is
    going on here (it can be found on the web in various places under the
    name "MPCFPE32B.pdf" -- Google is your friend).
*/

namespace boost {
namespace detail {
namespace atomic {

static inline void fence_before(memory_order order)
{
    switch(order) {
        case memory_order_release:
        case memory_order_acq_rel:
#if defined(__powerpc64__)
            __asm__ __volatile__ ("lwsync" ::: "memory");
            break;
#endif
            /* no lwsync on 32-bit targets: fall through to full "sync" */
        case memory_order_seq_cst:
            __asm__ __volatile__ ("sync" ::: "memory");
        default:;
    }
}

/* Note on the barrier instructions used by fence_after and
atomic_thread_fence: the "isync" instruction normally does
not wait for memory-accessing operations to complete; the
"trick" is to introduce a conditional branch that formally
depends on the memory-accessing instruction -- isync waits
until the branch can be resolved and thus implicitly until
the memory access completes.

This means that the load(memory_order_relaxed) operation
includes this branch even though no barrier would be required
there, but in exchange atomic_thread_fence(memory_order_acquire)
can be implemented using "isync" instead of the far more
expensive "sync". The following simple cost analysis provides
the rationale for this decision:

- isync: ~12 cycles
- sync: ~50 cycles
- "spurious" branch after load: 1-2 cycles
- making the right decision: priceless

*/
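
/* For illustration, the acquire-load sequence produced by the classes
   below is therefore (sketch only; register numbers are arbitrary):

       lwz   r9,0(r3)    # plain load
       cmpw  r9,r9       # always-equal compare, depends on the load
       bne-  1f          # never taken, resolves only after the load
   1:  isync             # waits for the branch, hence for the load

   A standalone acquire fence has no preceding load of its own to depend
   on, but because every load above already carries the branch, a bare
   "isync" still suffices in atomic_thread_fence. */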

static inline void fence_after(memory_order order)
{
    switch(order) {
        case memory_order_acquire:
        case memory_order_acq_rel:
        case memory_order_seq_cst:
            __asm__ __volatile__ ("isync");
            /* deliberate fall-through: the empty asm below provides
               the compiler-level barrier for these cases as well */
        case memory_order_consume:
            __asm__ __volatile__ ("" ::: "memory");
        default:;
    }
}

template<>
inline void platform_atomic_thread_fence(memory_order order)
{
    switch(order) {
        case memory_order_acquire:
            __asm__ __volatile__ ("isync" ::: "memory");
            break;
        case memory_order_release:
        case memory_order_acq_rel:
#if defined(__powerpc64__)
            __asm__ __volatile__ ("lwsync" ::: "memory");
            break;
#endif
            /* no lwsync on 32-bit targets: fall through to full "sync" */
        case memory_order_seq_cst:
            __asm__ __volatile__ ("sync" ::: "memory");
        default:;
    }
}
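
/* For reference, the memory_order -> barrier mapping implemented by the
   helpers above (lwsync is used only on __powerpc64__; 32-bit targets
   fall through to "sync"):

       order      fence_before   fence_after     thread_fence
       ------------------------------------------------------
       relaxed    -              -               -
       consume    -              compiler only   -
       acquire    -              isync           isync
       release    lwsync/sync    -               lwsync/sync
       acq_rel    lwsync/sync    isync           lwsync/sync
       seq_cst    sync           isync           sync
*/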

/* Note: the __asm__ constraint "b" instructs gcc to use any register
   except r0; this is required because, in some instruction encodings,
   r0 in the base-register position is read as the constant 0 rather
   than the register's contents. Since it is not always obvious whether
   r0 is permitted in a given spot, just play it safe and avoid r0
   entirely -- ppc isn't exactly register-starved, so this really
   should not matter :) */

template<typename T>
class atomic_ppc_32 {
public:
    typedef T integral_type;
    explicit atomic_ppc_32(T v) : i(v) {}
    atomic_ppc_32() {}
    T load(memory_order order=memory_order_seq_cst) const volatile
    {
        T v=*reinterpret_cast<volatile const T *>(&i);
        __asm__ __volatile__ (
            /* dependent-branch trick, see the note above */
            "cmpw %0, %0\n"
            "bne- 1f\n"
            "1:\n"
            : "+b"(v)
            :
            : "cc");
        fence_after(order);
        return v;
    }
    void store(T v, memory_order order=memory_order_seq_cst) volatile
    {
        fence_before(order);
        *reinterpret_cast<volatile T *>(&i)=v;
    }
    bool compare_exchange_weak(
        T &expected,
        T desired,
        memory_order success_order,
        memory_order failure_order) volatile
    {
        fence_before(success_order);
        int success;
        __asm__ __volatile__(
            "lwarx %0,0,%2\n"   /* load and reserve */
            "cmpw %0, %3\n"     /* matches "expected"? */
            "bne- 2f\n"
            "stwcx. %4,0,%2\n"  /* store conditionally; may fail spuriously */
            "bne- 2f\n"
            "addi %1,0,1\n"
            "1:"

            ".subsection 2\n"
            "2: addi %1,0,0\n"
            "b 1b\n"
            ".previous\n"
                : "=&b" (expected), "=&b" (success)
                : "b" (&i), "b" (expected), "b" ((int)desired)
                : "cc", "memory"
            );
        if (success) fence_after(success_order);
        else fence_after(failure_order);
        return success;
    }
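
    /* Usage sketch (illustrative only, not part of this header): the
       canonical retry loop around compare_exchange_weak, here bumping a
       counter but saturating at "limit" ("a" stands for an atomic built
       on this class):

           T cur = a.load(memory_order_relaxed);
           while (cur < limit
                  && !a.compare_exchange_weak(cur, cur + 1,
                         memory_order_acq_rel, memory_order_relaxed)) {
               // on failure (real mismatch or spurious stwcx. failure),
               // "cur" has been refreshed with the observed value; retry
           }
    */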

    bool is_lock_free(void) const volatile {return true;}
protected:
    inline T fetch_add_var(T c, memory_order order) volatile
    {
        fence_before(order);
        T original, tmp;
        __asm__ __volatile__(
            "1: lwarx %0,0,%2\n"    /* load and reserve */
            "add %1,%0,%3\n"
            "stwcx. %1,0,%2\n"      /* store new value if still reserved */
            "bne- 1b\n"             /* retry if reservation was lost */
            : "=&b" (original), "=&b" (tmp)
            : "b" (&i), "b" (c)
            : "cc", "memory");
        fence_after(order);
        return original;
    }
    inline T fetch_inc(memory_order order) volatile
    {
        fence_before(order);
        T original, tmp;
        __asm__ __volatile__(
            "1: lwarx %0,0,%2\n"
            "addi %1,%0,1\n"
            "stwcx. %1,0,%2\n"
            "bne- 1b\n"
            : "=&b" (original), "=&b" (tmp)
            : "b" (&i)
            : "cc", "memory");
        fence_after(order);
        return original;
    }
    inline T fetch_dec(memory_order order) volatile
    {
        fence_before(order);
        T original, tmp;
        __asm__ __volatile__(
            "1: lwarx %0,0,%2\n"
            "addi %1,%0,-1\n"
            "stwcx. %1,0,%2\n"
            "bne- 1b\n"
            : "=&b" (original), "=&b" (tmp)
            : "b" (&i)
            : "cc", "memory");
        fence_after(order);
        return original;
    }
private:
    T i;
};
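
/* Illustrative use of the protected fetch_dec hook for reference
   counting (sketch only; "refcount" stands for an atomic integer built
   on the class above, and destroy() is a hypothetical cleanup):

       if (refcount.fetch_dec(memory_order_release) == 1) {
           platform_atomic_thread_fence(memory_order_acquire);
           destroy(obj);   // last reference gone
       }
*/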

#if defined(__powerpc64__)

#warning Untested code -- please inform me if it works

template<typename T>
class atomic_ppc_64 {
public:
    typedef T integral_type;
    explicit atomic_ppc_64(T v) : i(v) {}
    atomic_ppc_64() {}
    T load(memory_order order=memory_order_seq_cst) const volatile
    {
        T v=*reinterpret_cast<volatile const T *>(&i);
        __asm__ __volatile__ (
            /* dependent-branch trick, see the note above */
            "cmpw %0, %0\n"
            "bne- 1f\n"
            "1:\n"
            : "+b"(v)
            :
            : "cc");
        fence_after(order);
        return v;
    }
    void store(T v, memory_order order=memory_order_seq_cst) volatile
    {
        fence_before(order);
        *reinterpret_cast<volatile T *>(&i)=v;
    }
    bool compare_exchange_weak(
        T &expected,
        T desired,
        memory_order success_order,
        memory_order failure_order) volatile
    {
        fence_before(success_order);
        int success;
        __asm__ __volatile__(
            "ldarx %0,0,%2\n"   /* load and reserve */
            "cmpd %0, %3\n"     /* 64-bit compare; cmpw would check only the low word */
            "bne- 2f\n"
            "stdcx. %4,0,%2\n"  /* store conditionally; may fail spuriously */
            "bne- 2f\n"
            "addi %1,0,1\n"
            "1:"

            ".subsection 2\n"
            "2: addi %1,0,0\n"
            "b 1b\n"
            ".previous\n"
                : "=&b" (expected), "=&b" (success)
                : "b" (&i), "b" (expected), "b" (desired)
                : "cc", "memory"
            );
        if (success) fence_after(success_order);
        else fence_after(failure_order);
        return success;
    }

    bool is_lock_free(void) const volatile {return true;}
protected:
    inline T fetch_add_var(T c, memory_order order) volatile
    {
        fence_before(order);
        T original, tmp;
        __asm__ __volatile__(
            "1: ldarx %0,0,%2\n"
            "add %1,%0,%3\n"
            "stdcx. %1,0,%2\n"
            "bne- 1b\n"
            : "=&b" (original), "=&b" (tmp)
            : "b" (&i), "b" (c)
            : "cc", "memory");
        fence_after(order);
        return original;
    }
    inline T fetch_inc(memory_order order) volatile
    {
        fence_before(order);
        T original, tmp;
        __asm__ __volatile__(
            "1: ldarx %0,0,%2\n"
            "addi %1,%0,1\n"
            "stdcx. %1,0,%2\n"
            "bne- 1b\n"
            : "=&b" (original), "=&b" (tmp)
            : "b" (&i)
            : "cc", "memory");
        fence_after(order);
        return original;
    }
    inline T fetch_dec(memory_order order) volatile
    {
        fence_before(order);
        T original, tmp;
        __asm__ __volatile__(
            "1: ldarx %0,0,%2\n"
            "addi %1,%0,-1\n"
            "stdcx. %1,0,%2\n"
            "bne- 1b\n"
            : "=&b" (original), "=&b" (tmp)
            : "b" (&i)
            : "cc", "memory");
        fence_after(order);
        return original;
    }
private:
    T i;
};
#endif

template<typename T>
class platform_atomic_integral<T, 4> : public build_atomic_from_typical<build_exchange<atomic_ppc_32<T> > > {
public:
    typedef build_atomic_from_typical<build_exchange<atomic_ppc_32<T> > > super;
    explicit platform_atomic_integral(T v) : super(v) {}
    platform_atomic_integral(void) {}
};

template<typename T>
class platform_atomic_integral<T, 1>: public build_atomic_from_larger_type<atomic_ppc_32<uint32_t>, T> {
public:
    typedef build_atomic_from_larger_type<atomic_ppc_32<uint32_t>, T> super;

    explicit platform_atomic_integral(T v) : super(v) {}
    platform_atomic_integral(void) {}
};

template<typename T>
class platform_atomic_integral<T, 2>: public build_atomic_from_larger_type<atomic_ppc_32<uint32_t>, T> {
public:
    typedef build_atomic_from_larger_type<atomic_ppc_32<uint32_t>, T> super;

    explicit platform_atomic_integral(T v) : super(v) {}
    platform_atomic_integral(void) {}
};
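
/* The 1- and 2-byte specializations above delegate to
   build_atomic_from_larger_type (builder.hpp), which emulates narrow
   atomics on the 4-byte primitive. A rough sketch of the technique,
   with hypothetical helper names -- the real builder differs in
   detail:

       uint32_t *word = round_down_to_word(addr);     // containing aligned word
       unsigned shift = 8 * byte_index_in_word(addr); // endian-dependent
       uint32_t mask = 0xff << shift;
       uint32_t old_w = word_atomic.load(memory_order_relaxed), new_w;
       do {
           new_w = (old_w & ~mask) | ((uint32_t)v << shift);
       } while (!word_atomic.compare_exchange_weak(old_w, new_w,
                    order, memory_order_relaxed));
*/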

#if defined(__powerpc64__)
template<typename T>
class platform_atomic_integral<T, 8> : public build_atomic_from_typical<build_exchange<atomic_ppc_64<T> > > {
public:
    typedef build_atomic_from_typical<build_exchange<atomic_ppc_64<T> > > super;
    explicit platform_atomic_integral(T v) : super(v) {}
    platform_atomic_integral(void) {}
};
#endif

} /* namespace atomic */
} /* namespace detail */
} /* namespace boost */

#endif