#ifndef BOOST_DETAIL_ATOMIC_LINUX_ARM_HPP
#define BOOST_DETAIL_ATOMIC_LINUX_ARM_HPP

//  Distributed under the Boost Software License, Version 1.0.
//  See accompanying file LICENSE_1_0.txt or copy at
//  http://www.boost.org/LICENSE_1_0.txt)
//
//  Copyright (c) 2009 Helge Bahmann
//  Copyright (c) 2009 Phil Endecott
//  ARM Code by Phil Endecott, based on other architectures.

#include <boost/memory_order.hpp>
#include <boost/detail/atomic/base.hpp>
#include <boost/detail/atomic/builder.hpp>

namespace boost {
namespace detail {
namespace atomic {

// Different ARM processors have different atomic instructions.  In particular,
// architecture versions before v6 (which are still in widespread use, e.g. the
// Intel/Marvell XScale chips like the one in the NSLU2) have only atomic swap.
// On Linux the kernel provides some support that lets us abstract away from
// these differences: it provides emulated CAS and barrier functions at special
// addresses that are guaranteed not to be interrupted by the kernel.  Using
// this facility is slightly slower than inline assembler would be, but much
// faster than a system call.
//
// For documentation, see arch/arm/kernel/entry-armv.S in the kernel source
// (search for "User Helpers").

typedef void (kernel_dmb_t)(void);
#define BOOST_ATOMIC_KERNEL_DMB (*(kernel_dmb_t *)0xffff0fa0)

static inline void fence_before(memory_order order)
{
	switch(order) {
		// FIXME I really don't know which of these cases should call
		// kernel_dmb() and which shouldn't...
		case memory_order_consume:
		case memory_order_release:
		case memory_order_acq_rel:
		case memory_order_seq_cst:
			BOOST_ATOMIC_KERNEL_DMB();
		default:;
	}
}

static inline void fence_after(memory_order order)
{
	switch(order) {
		// FIXME I really don't know which of these cases should call
		// kernel_dmb() and which shouldn't...
		case memory_order_acquire:
		case memory_order_acq_rel:
		case memory_order_seq_cst:
			BOOST_ATOMIC_KERNEL_DMB();
		default:;
	}
}

#undef BOOST_ATOMIC_KERNEL_DMB

template<typename T>
class atomic_linux_arm_4 {
	// typedef int (kernel_cmpxchg_t)(T oldval, T newval, T *ptr);
	typedef int (kernel_cmpxchg_t)(T oldval, T newval, volatile T *ptr);
#	define BOOST_ATOMIC_KERNEL_CMPXCHG (*(kernel_cmpxchg_t *)0xffff0fc0)
	// Returns 0 if *ptr was changed.
public:
	explicit atomic_linux_arm_4(T v) : i(v) {}
	atomic_linux_arm_4() {}
	T load(memory_order order=memory_order_seq_cst) const volatile
	{
		T v=const_cast<const volatile T &>(i);
		fence_after(order);
		return v;
	}
	void store(T v, memory_order order=memory_order_seq_cst) volatile
	{
		fence_before(order);
		const_cast<volatile T &>(i)=v;
	}
	bool compare_exchange_strong(
		T &expected,
		T desired,
		memory_order success_order,
		memory_order failure_order) volatile
	{
		// Apparently we can consider kernel_cmpxchg to be strong if it is retried
		// by the kernel after being interrupted, which I think it is.
		// Also it seems that when an ll/sc implementation is used the kernel
		// loops until the store succeeds.
		bool success = BOOST_ATOMIC_KERNEL_CMPXCHG(expected,desired,&i)==0;
		if (!success) expected = load(memory_order_relaxed);
		return success;
	}
	bool compare_exchange_weak(
		T &expected,
		T desired,
		memory_order success_order,
		memory_order failure_order) volatile
	{
		return compare_exchange_strong(expected, desired, success_order, failure_order);
	}
	T exchange(T replacement, memory_order order=memory_order_seq_cst) volatile
	{
		// Copied from build_exchange.
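		// The swap is emulated with a CAS retry loop: read the current value,
		// then keep calling compare_exchange_weak until the stored value is
		// still the one we read and `replacement` has been installed atomically.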
		T o=load(memory_order_relaxed);
		do {} while(!compare_exchange_weak(o, replacement, order, order));
		return o;

		// Note that ARM has an atomic swap instruction that we could use here:
		//   T oldval;
		//   asm volatile ("swp\t%0, %1, [%2]" : "=&r"(oldval) : "r" (replacement), "r" (&i) : "memory");
		//   return oldval;
		// This instruction is deprecated in architecture >= 6.  I'm unsure how
		// inefficient its implementation is on those newer architectures.
		// I don't think this would gain much since exchange() is not used often.
	}
	bool is_lock_free(void) const volatile {return true;}

	typedef T integral_type;
private:
	T i;
#	undef BOOST_ATOMIC_KERNEL_CMPXCHG
};

template<typename T>
class platform_atomic_integral<T, 4> : public build_atomic_from_exchange<atomic_linux_arm_4<T> > {
public:
	typedef build_atomic_from_exchange<atomic_linux_arm_4<T> > super;
	explicit platform_atomic_integral(T v) : super(v) {}
	platform_atomic_integral(void) {}
};

template<typename T>
class platform_atomic_integral<T, 1> : public build_atomic_from_larger_type<atomic_linux_arm_4<uint32_t>, T> {
public:
	typedef build_atomic_from_larger_type<atomic_linux_arm_4<uint32_t>, T> super;
	explicit platform_atomic_integral(T v) : super(v) {}
	platform_atomic_integral(void) {}
};

template<typename T>
class platform_atomic_integral<T, 2> : public build_atomic_from_larger_type<atomic_linux_arm_4<uint32_t>, T> {
public:
	typedef build_atomic_from_larger_type<atomic_linux_arm_4<uint32_t>, T> super;
	explicit platform_atomic_integral(T v) : super(v) {}
	platform_atomic_integral(void) {}
};

typedef atomic_linux_arm_4<void *> platform_atomic_address;

}
}
}

#endif
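
// Illustrative sketch only (not part of the original header): how the 4-byte
// primitive above behaves when driven directly, assuming code running on an
// ARM Linux target where the kernel user helpers exist.
//
//   boost::detail::atomic::atomic_linux_arm_4<int> counter(0);
//   int expected = 0;
//   bool ok = counter.compare_exchange_strong(expected, 1,
//       boost::memory_order_seq_cst, boost::memory_order_seq_cst);
//   int value = counter.load();  // 1 if ok; otherwise expected holds the observed value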