Hello GCC List,
i am currently working on a hardware API in C++11 for ARM Cortex-M3
microcontrollers. It provides an object oriented way of accessing
hardware registers. The idea is that the user need not worry about
individual registers and their composition of bit fields but can access
these with symbolic names.
The API uses temporary objects and call chaining for syntactic sugar.
The problem is now that GCC produces correct, but way too slow and too
much code.
See the attached simplified testcase (with a dummy linker script to
shorten disassembler output) and the function getInput. When compiling
with gcc-arm-embedded ( https://launchpad.net/gcc-arm-embedded ), this
is the code generated by GCC:
00000000 <getInput()>:
0: b470 push {r4, r5, r6}
2: 2300 movs r3, #0
4: b097 sub sp, #92 ; 0x5c
6: 2204 movs r2, #4
8: 2101 movs r1, #1
a: 2603 movs r6, #3
c: 2511 movs r5, #17
e: 2412 movs r4, #18
10: 480f ldr r0, [pc, #60] ; (50 <getInput()+0x50>)
12: f88d 3000 strb.w r3, [sp]
16: 9303 str r3, [sp, #12]
18: 9304 str r3, [sp, #16]
1a: 9307 str r3, [sp, #28]
1c: 9308 str r3, [sp, #32]
1e: f88d 302c strb.w r3, [sp, #44] ; 0x2c
22: f88d 6001 strb.w r6, [sp, #1]
26: f88d 202d strb.w r2, [sp, #45] ; 0x2d
2a: f88d 2038 strb.w r2, [sp, #56] ; 0x38
2e: f88d 2039 strb.w r2, [sp, #57] ; 0x39
32: f88d 5044 strb.w r5, [sp, #68] ; 0x44
36: f88d 1045 strb.w r1, [sp, #69] ; 0x45
3a: f88d 1051 strb.w r1, [sp, #81] ; 0x51
3e: f88d 4050 strb.w r4, [sp, #80] ; 0x50
42: 6800 ldr r0, [r0, #0]
44: f3c0 4080 ubfx r0, r0, #18, #1
48: b017 add sp, #92 ; 0x5c
4a: bc70 pop {r4, r5, r6}
4c: 4770 bx lr
4e: bf00 nop
50: 00c0ffee .word 0x00c0ffee
GCC allocates a stack frame ("sub sp, #92") of size 92, fills it with
some values ("strX rY, [sp, #...]") . Then that frame is discarded ("add
sp, #92"), but the data written there is never even read. So most of the
instructions in the generated code (0-e, 12-3e, 48-4A) can just be left
out, resulting in equally functional, but faster and smaller code that
doesn't use the stack at all:
00000000 <getInput()>:
0: 4b02 ldr r3, [pc, #8] ; (c <getInput()+0xc>)
2: 6818 ldr r0, [r3, #0]
4: f3c0 4080 ubfx r0, r0, #18, #1
8: 4770 bx lr
a: bf00 nop
c: 00c0ffee .word 0x00c0ffee
The actual read access here occurs in function
"RegContainer<RegType>::getReg".
The same applies to the other functions setOutput() and main(), while
configure() looks OK.
The problem occurs with both arm-none-eabi-gcc 4.8.3, 4.9.0 and also
x86_64-linux-gnu-gcc 4.8.1 (while that code is of course of little use
on x86_64).
So, the question is - is that behaviour a result of a bug in my code or
a GCC bug? Is that a known problem?
Thanks in advance,
Niklas Gürtler
/*
* Testcase for possible GCC bug.
* This code targets a Cortex-M3 (ARMv7M) mikrocontroller. On that platform, hardware registers are
* essentially "special" memory locations which can be accessed by writing to certain addresses.
*
* This code aims at providing a nice and efficient object-oriented API for peripheral functions.
* This concrete dummy example configures general purpose I/O pins.
*/
#include <cstdint>
#include <limits>
#include <utility>
#include <type_traits>
/*
* Generic code for I/O access
*/
// For less typing
#define my_always_inline __attribute__((always_inline)) inline
/**
* Instances of RegContainer should only be temporary. It encapsulates access to registers by
* functions that provide access to bitfields within one register. Multiple read accesses to
* one register are avoided by using a cache. Write accesses are combined into one write that
* is performed upon destruction. The modMask field is used to only overwrite changed
* bitfields and keep unchanged areas.
*/
template <typename RegType>
class RegContainer {
public:
my_always_inline RegContainer (uintptr_t addr);
my_always_inline RegContainer (RegContainer&& src);
my_always_inline RegContainer (const RegContainer&) = delete;
my_always_inline RegContainer& operator = (const RegContainer&) = delete;
my_always_inline RegContainer& operator = (RegContainer&& src) = delete;
/// Performs the write access if neccessary
my_always_inline ~RegContainer ();
/// Read a part of the register
my_always_inline RegType getField (uint8_t bit, uint8_t size);
/// Write a part of the register
my_always_inline void setField (uint8_t bit, uint8_t size, RegType value);
private:
/// Perform read access
my_always_inline RegType getReg ();
/// Address of the register
uintptr_t m_address;
/// Whether the read access has already been performed
bool m_loaded;
RegType m_cache, ///^ Both the read register and the values to write are stored here.
m_modMask; ///^ Areas in m_cache where m_modMask is '1' have been modified
};
/**
* Provides the access layer for the user code. Write/Read accesses are redirected to an
* instance of RegContainer via setField/getField and the appropriate bit/size specifications.
* Essentially represents a field in a register.
*/
template <typename ParamType_, typename Container_, typename Object_>
class ParamProxy {
public:
using ParamType = ParamType_;
using Container = Container_;
using Object = Object_;
my_always_inline constexpr ParamProxy (Container cont, Object obj, uint8_t bit, uint8_t size);
my_always_inline constexpr ParamProxy (const ParamProxy&) = delete;
my_always_inline ParamProxy (ParamProxy&&) = delete;
my_always_inline ParamProxy& operator = (const ParamProxy&) = delete;
my_always_inline ParamProxy& operator = (ParamProxy&&) = delete;
my_always_inline ~ParamProxy () = default;
my_always_inline Object set (const ParamType& p);
my_always_inline ParamType get ();
my_always_inline Object get (ParamType& out);
/// Alias for set
my_always_inline Object operator () (const ParamType& p);
/// Alias for get
my_always_inline ParamType operator () ();
/// Alias for set
my_always_inline ParamType operator = (const ParamType& p);
/// Alias for get
my_always_inline ParamType operator* ();
private:
Container m_cont;
Object m_object;
uint8_t m_bit, m_size;
};
/*
* Concrete example for configuring GPIO pins
*/
/**
* Temporary object for configuring parameters for GPIO pins.
*/
class PinFields {
private:
/// Port & Pin of the hardware pin
uint8_t m_iPort, m_iPin;
/// Registers to be accessed.
RegContainer<uint32_t> m_cCRL, m_cCRH;
public:
my_always_inline PinFields (uint8_t iPort, uint8_t iPin);
my_always_inline PinFields (const PinFields&) = delete;
my_always_inline PinFields (PinFields&& src);
my_always_inline PinFields& operator = (const PinFields&) = delete;
my_always_inline PinFields& operator = (PinFields&&) = delete;
my_always_inline ~PinFields () = default;
// Some dummy example parameters.
ParamProxy<uint32_t, RegContainer<uint32_t>&, PinFields&> param1;
ParamProxy<uint32_t, RegContainer<uint32_t>&, PinFields&> param2;
ParamProxy<bool, RegContainer<uint32_t>&, PinFields&> out;
ParamProxy<bool, RegContainer<uint32_t>&, PinFields&> in;
};
/**
* Represents a hardware pin.
*/
class Pin {
public:
my_always_inline constexpr Pin (uint8_t iPort, uint8_t iPin);
my_always_inline constexpr Pin (const Pin&) = default;
my_always_inline Pin (Pin&&) = default;
my_always_inline Pin& operator = (const Pin&) = default;
my_always_inline Pin& operator = (Pin&&) = default;
my_always_inline ~Pin () = default;
/// Creates a temporary PinFields object for reading/writing hardware parameters.
my_always_inline PinFields fields () const;
private:
uint8_t m_iPort, m_iPin;
};
/**
* Helper function. Generates an integer whose bits "bit" - "bit+count-1" are set to '1' and the rest to '0'
*/
template <typename T>
my_always_inline constexpr T mask (const int count, int bit = 0) {
return ((T { 1 } << count) - T { 1 }) << bit;
}
template <typename RegType>
my_always_inline RegContainer<RegType>::RegContainer (uintptr_t addr) :
m_address (addr), m_loaded (false), m_cache (0), m_modMask (0) {
}
template <typename RegType>
my_always_inline RegContainer<RegType>::RegContainer (RegContainer&& src) :
m_address (src.m_address), m_loaded (src.m_loaded), m_cache (src.m_cache), m_modMask (src.m_modMask) {
src.m_modMask = 0;
}
template <typename RegType>
my_always_inline RegContainer<RegType>::~RegContainer () {
// Did we modify anything?
if (m_modMask != 0) {
RegType temp = 0;
// Did we not already load the register contents?
if (!m_loaded) {
// => Load it now
auto read = *reinterpret_cast<volatile RegType*> (m_address);
// Only keep the part that we did not modify
temp = read & ~m_modMask;
}
// Apply changes
temp = temp | m_cache;
// Perform the write access.
*reinterpret_cast<volatile RegType*> (m_address) = temp;
}
}
template <typename RegType>
my_always_inline RegType RegContainer<RegType>::getReg () {
// Did we not already load the register?
if (!m_loaded) {
// Perform the read access
auto read = *reinterpret_cast<volatile RegType*> (m_address);
// Keep the part of m_cache that we modified as it is, and overwrite the rest with the loaded value
m_cache = (read & ~m_modMask) | (m_cache & m_modMask);
m_loaded = true;
}
return m_cache;
}
template <typename RegType>
my_always_inline RegType RegContainer<RegType>::getField (uint8_t bit, uint8_t size) {
return (getReg () >> bit) & mask<RegType> (size);
}
template <typename RegType>
my_always_inline void RegContainer<RegType>::setField (uint8_t bit, uint8_t size, RegType value) {
// Calculate the bitmask for the part that we want to modify
auto mask = ::mask<RegType> (size, bit);
auto write = value << bit;
// Change the cache
m_cache = (m_cache & ~mask) | write;
// Remember which part of the cache we modified
m_modMask |= mask;
}
template <typename ParamType_, typename Container_, typename Object_>
my_always_inline constexpr ParamProxy<ParamType_, Container_, Object_>::ParamProxy (Container_ cont, Object_ obj, uint8_t bit, uint8_t size) :
// WTF
// m_cont (std::forward < Container > (cont)), m_object (std::forward < Object > (obj)), m_bit (bit), m_size (size) {
m_cont (cont), m_object (obj), m_bit (bit), m_size (size) {
}
template <typename ParamType, typename Container, typename Object>
my_always_inline Object ParamProxy<ParamType, Container, Object>::set (const ParamType& p) {
m_cont.setField (m_bit, m_size, p);
return m_object;
}
template <typename ParamType, typename Container, typename Object>
my_always_inline ParamType ParamProxy<ParamType, Container, Object>::get () {
return m_cont.getField (m_bit, m_size);
}
template <typename ParamType, typename Container, typename Object>
my_always_inline Object ParamProxy<ParamType, Container, Object>::get (ParamType& out) {
out = get ();
return m_object;
}
template <typename ParamType, typename Container, typename Object>
my_always_inline Object ParamProxy<ParamType, Container, Object>::operator () (const ParamType& p) {
return set (p);
}
template <typename ParamType, typename Container, typename Object>
my_always_inline ParamType ParamProxy<ParamType, Container, Object>::operator () () {
return get ();
}
template <typename ParamType, typename Container, typename Object>
my_always_inline ParamType ParamProxy<ParamType, Container, Object>::operator = (const ParamType& p) {
set (p);
return p;
}
template <typename ParamType, typename Container, typename Object>
my_always_inline ParamType ParamProxy<ParamType, Container, Object>::operator * () {
return get ();
}
my_always_inline constexpr Pin::Pin (uint8_t iPort, uint8_t iPin) :
m_iPort (iPort), m_iPin (iPin) {
}
my_always_inline PinFields Pin::fields () const {
return {m_iPort, m_iPin};
}
my_always_inline PinFields::PinFields (uint8_t iPort, uint8_t iPin) :
m_iPort (iPort), m_iPin (iPin), m_cCRL (0xDEADBEEF), m_cCRH (0x00C0FFEE),
param1 (m_cCRL, *this, 0, 4), param2 (m_cCRL, *this, 4, 4), out (m_cCRH, *this, 17, 1), in (m_cCRH, *this, 18, 1) {
}
my_always_inline PinFields::PinFields (PinFields&& src) :
m_iPort (src.m_iPort),
m_iPin (src.m_iPin),
m_cCRL (std::move (src.m_cCRL)),
m_cCRH (std::move (src.m_cCRH)),
param1 (m_cCRL, *this, 0, 4),
param2 (m_cCRL, *this, 4, 4),
out (m_cCRH, *this, 17, 1),
in (m_cCRH, *this, 18, 1) {
}
/*
* Example code
*/
// Get two certain pins
static constexpr auto inPin = Pin { 0, 3 };
static constexpr auto outPin = Pin { 5, 7 };
bool getInput () {
return inPin.fields ().in.get ();
}
void setOutput (bool x) {
outPin.fields ().out = x;
}
void configure () {
// Configure pin parameters...
inPin.fields ()
.param1.set (7)
.param2.set (3);
outPin.fields ()
.param1.set (2)
.param2.set (1);
}
int main () {
configure ();
while (true) {
setOutput (getInput ());
}
}
PREFIX=arm-none-eabi-
CXXFLAGS=-Wall -Wextra -std=c++11 -O3 -mcpu=cortex-m3 -mthumb
LDFLAGS =$(CXXFLAGS) -nostdlib
CXX=$(PREFIX)g++
OBJDUMP=$(PREFIX)objdump
test.S : test.elf
$(OBJDUMP) -C -d $< > $@
test.elf : main.o test.ld
$(CXX) $(LDFLAGS) -o $@ $< -T test.ld
main.o : main.cc
$(CXX) $(CXXFLAGS) -c -o $@ $<
clean :
rm test.S test.elf main.o
ENTRY(main)
/* Define output sections */
SECTIONS
{
/* The program code and other data goes into FLASH */
.text :
{
*(.text) /* .text sections (code) */
*(.text*) /* .text* sections (code) */
}
}