Implement a simple peephole optimization framework for ARM.

Implement some optimizations:

+ performing arithmetic by a small constant is 3 instructions shorter.
+ reading a local variables is 1 instruction shorter.
+ constant array indexing (e.g. p[5]) is 5 instructions shorter.
diff --git a/libacc/acc.cpp b/libacc/acc.cpp
index 81d30ad..62dea4c 100644
--- a/libacc/acc.cpp
+++ b/libacc/acc.cpp
@@ -53,6 +53,9 @@
 
 // #define PROVIDE_TRACE_CODEGEN
 
+// Uncomment to disable ARM peephole optimizations
+// #define DISABLE_ARM_PEEPHOLE
+
 // Uncomment to save input to a text file in DEBUG_DUMP_PATTERN
 // #define DEBUG_SAVE_INPUT_TO_FILE
 
@@ -185,7 +188,21 @@
         Type* pType;
     };
 
-    class CodeBuf {
+    class ICodeBuf {
+    public:
+        virtual ~ICodeBuf() {}
+        virtual void init(int size) = 0;
+        virtual void setErrorSink(ErrorSink* pErrorSink) = 0;
+        virtual void o4(int n) = 0;
+        virtual void ob(int n) = 0;
+        virtual void* getBase() = 0;
+        virtual intptr_t getSize() = 0;
+        virtual intptr_t getPC() = 0;
+        // Call this before trying to modify code in the buffer.
+        virtual void flush() = 0;
+    };
+
+    class CodeBuf : public ICodeBuf {
         char* ind; // Output code pointer
         char* pProgramBase;
         ErrorSink* mErrorSink;
@@ -220,52 +237,52 @@
             mOverflowed = false;
         }
 
-        ~CodeBuf() {
+        virtual ~CodeBuf() {
             release();
         }
 
-        void init(int size) {
+        virtual void init(int size) {
             release();
             mSize = size;
             pProgramBase = (char*) calloc(1, size);
             ind = pProgramBase;
         }
 
-        void setErrorSink(ErrorSink* pErrorSink) {
+        virtual void setErrorSink(ErrorSink* pErrorSink) {
             mErrorSink = pErrorSink;
         }
 
-        int o4(int n) {
+        virtual void o4(int n) {
             if(check(4)) {
-                return 0;
+                return;
             }
-            intptr_t result = (intptr_t) ind;
             * (int*) ind = n;
             ind += 4;
-            return result;
         }
 
         /*
          * Output a byte. Handles all values, 0..ff.
          */
-        void ob(int n) {
+        virtual void ob(int n) {
             if(check(1)) {
                 return;
             }
             *ind++ = n;
         }
 
-        inline void* getBase() {
+        virtual void* getBase() {
             return (void*) pProgramBase;
         }
 
-        intptr_t getSize() {
+        virtual intptr_t getSize() {
             return ind - pProgramBase;
         }
 
-        intptr_t getPC() {
+        virtual intptr_t getPC() {
             return (intptr_t) ind;
         }
+
+        virtual void flush() {}
     };
 
     /**
@@ -297,7 +314,7 @@
         }
         virtual ~CodeGenerator() {}
 
-        virtual void init(CodeBuf* pCodeBuf) {
+        virtual void init(ICodeBuf* pCodeBuf) {
             this->pCodeBuf = pCodeBuf;
             pCodeBuf->setErrorSink(mErrorSink);
         }
@@ -544,8 +561,8 @@
             pCodeBuf->ob(n);
         }
 
-        intptr_t o4(int data) {
-            return pCodeBuf->o4(data);
+        void o4(int data) {
+            pCodeBuf->o4(data);
         }
 
         intptr_t getBase() {
@@ -560,6 +577,10 @@
             return pCodeBuf->getSize();
         }
 
+        void flush() {
+            pCodeBuf->flush();
+        }
+
         void error(const char* fmt,...) {
             va_list ap;
             va_start(ap, fmt);
@@ -675,12 +696,225 @@
 
     private:
         Vector<ExpressionValue> mExpressionStack;
-        CodeBuf* pCodeBuf;
+        ICodeBuf* pCodeBuf;
         ErrorSink* mErrorSink;
     };
 
 #ifdef PROVIDE_ARM_CODEGEN
 
+    static size_t rotateRight(size_t n, size_t rotate) {
+        return (n >> rotate) | (n << (32 - rotate));
+    }
+
+    static size_t rotateLeft(size_t n, size_t rotate) {
+        return (n << rotate) | (n >> (32 - rotate));
+    }
+
+    static bool encode12BitImmediate(size_t immediate, size_t* pResult) {
+        for(size_t i = 0; i < 16; i++) {
+            size_t rotate = i * 2;
+            size_t mask = rotateRight(0xff, rotate);
+            if ((immediate | mask) == mask) {
+                size_t bits8 = rotateLeft(immediate, rotate);
+                // assert(bits8 <= 0xff);
+                *pResult = (i << 8) | bits8;
+                return true;
+            }
+        }
+        return false;
+    }
+
+    static size_t decode12BitImmediate(size_t immediate) {
+        size_t data = immediate & 0xff;
+        size_t rotate = 2 * ((immediate >> 8) & 0xf);
+        return rotateRight(data, rotate);
+    }
+
+    class ARMCodeBuf : public ICodeBuf {
+        ICodeBuf* mpBase;
+        ErrorSink* mErrorSink;
+
+        class CircularQueue {
+            static const int SIZE = 16; // Must be power of 2
+            static const int MASK = SIZE-1;
+            unsigned int mBuf[SIZE];
+            int mHead;
+            int mCount;
+
+        public:
+            CircularQueue() {
+                mHead = 0;
+                mCount = 0;
+            }
+
+            void pushBack(unsigned int data) {
+                mBuf[(mHead + mCount) & MASK] = data;
+                mCount += 1;
+            }
+
+            unsigned int popFront() {
+                unsigned int result = mBuf[mHead];
+                mHead = (mHead + 1) & MASK;
+                mCount -= 1;
+                return result;
+            }
+
+            void popBack(int n) {
+                mCount -= n;
+            }
+
+            inline int count() {
+                return mCount;
+            }
+
+            bool empty() {
+                return mCount == 0;
+            }
+
+            bool full() {
+                return mCount == SIZE;
+            }
+
+            // The valid indexes are 1 - count() to 0
+            unsigned int operator[](int i) {
+                return mBuf[(mHead + mCount + i) & MASK];
+            }
+        };
+
+        CircularQueue mQ;
+
+        void error(const char* fmt,...) {
+            va_list ap;
+            va_start(ap, fmt);
+            mErrorSink->verror(fmt, ap);
+            va_end(ap);
+        }
+
+        void flush() {
+            while (!mQ.empty()) {
+                mpBase->o4(mQ.popFront());
+            }
+            mpBase->flush();
+        }
+
+    public:
+        ARMCodeBuf(ICodeBuf* pBase) {
+            mpBase = pBase;
+        }
+
+        virtual ~ARMCodeBuf() {
+            delete mpBase;
+        }
+
+        void init(int size) {
+            mpBase->init(size);
+        }
+
+        void setErrorSink(ErrorSink* pErrorSink) {
+            mErrorSink = pErrorSink;
+            mpBase->setErrorSink(pErrorSink);
+        }
+
+        void o4(int n) {
+            if (mQ.full()) {
+                mpBase->o4(mQ.popFront());
+            }
+            mQ.pushBack(n);
+
+#ifndef DISABLE_ARM_PEEPHOLE
+            // Peephole check
+            bool didPeep;
+            do {
+                const unsigned int opMask = 0x01e00000;
+                const unsigned int immediateMask = 0x00000fff;
+                didPeep = false;
+                if (mQ.count() >= 4) {
+
+                    // Operand by a small constant
+                    // push;mov #imm;pop;op ==> op #imm
+
+                    if (mQ[-4] == 0xe92d0001  && // stmfd	r13!, {r0}
+                        (mQ[-3] & ~immediateMask) == 0xe3a00000  && // mov	r0, #X
+                        mQ[-2] == 0xe8bd0002  && // ldmea	r13!, {r1}
+                        (mQ[-1] & ~opMask) == (0xe0810000 & ~opMask)) {  // OP	r0, r1, r0
+                        unsigned int movConst = mQ[-3];
+                        unsigned int op = mQ[-1];
+                        unsigned int combined = 0xe2000000 | (op & opMask) | (movConst & immediateMask);
+                        // fprintf(stderr, "op %x movConst %x combined %x\n", op, movConst, combined);
+                        if (! (combined == 0xe2800000 || combined == 0xe2400000)) { // add/sub #0
+                            mQ.popBack(4);
+                            mQ.pushBack(combined);
+                            didPeep = true;
+                        } else {
+                            mQ.popBack(4);
+                            didPeep = true;
+                        }
+                    }
+                }
+
+                // Load local variable
+                // sub r0,r11,#imm;ldr r0,[r0]  ==> ldr r0, [r11,#-imm]
+                if (mQ.count() >= 2) {
+                    if ((mQ[-2] & ~immediateMask) == 0xe24b0000 && // sub r0,r11,#imm
+                        mQ[-1] == 0xe5900000) { // ldr r0, [r0]
+                        unsigned int op = mQ[-2];
+                        unsigned int ld = mQ[-1];
+                        unsigned int combined = (op & immediateMask) | 0xE51B0000; // ldr r0, [r11, #-0]
+                        mQ.popBack(2);
+                        mQ.pushBack(combined);
+                        didPeep = true;
+                    }
+                }
+
+                // Constant array lookup
+
+                if (mQ.count() >= 6 &&
+                    mQ[-6] == 0xe92d0001 && // stmfd r13!, {r0}
+                    (mQ[-5] & ~immediateMask)== 0xe3a00000 && // mov r0, #0x00000001
+                    mQ[-4] == 0xe8bd0002 && // ldmea r13!, {r1}
+                    (mQ[-3] & ~immediateMask)== 0xe3a02000 && // mov r2, #0x00000004
+                    mQ[-2] == 0xe0000092 && // mul r0, r2, r0
+                    mQ[-1] == 0xe0810000) { // add r0, r1, r0
+                    unsigned int mov1 = mQ[-5];
+                    unsigned int mov2 = mQ[-3];
+                    unsigned int const1 = decode12BitImmediate(mov1);
+                    unsigned int const2 = decode12BitImmediate(mov2);
+                    unsigned int comboConst = const1 * const2;
+                    size_t immediate = 0;
+                    if (encode12BitImmediate(comboConst, &immediate)) {
+                        mQ.popBack(6);
+                        unsigned int add = immediate | 0xE2800000; // add r0, r0, #n
+                        if (comboConst) {
+                            mQ.pushBack(add);
+                        }
+                        didPeep = true;
+                    }
+                }
+
+            } while (didPeep);
+#endif
+        }
+
+        void ob(int n) {
+            error("ob() not supported.");
+        }
+
+        void* getBase() {
+            flush();
+            return mpBase->getBase();
+        }
+
+        intptr_t getSize() {
+            flush();
+            return mpBase->getSize();
+        }
+
+        intptr_t getPC() {
+            flush();
+            return mpBase->getPC();
+        }
+    };
+
     class ARMCodeGenerator : public CodeGenerator {
     public:
         ARMCodeGenerator() {
@@ -710,10 +944,12 @@
             // sp, fp -> oldfp, retadr, arg0 arg1 ....
             o4(0xE1A0B00D); // mov    fp, sp
             LOG_STACK("functionEntry: %d\n", mStackUse);
-            return o4(0xE24DD000); // sub    sp, sp, # <local variables>
+            int pc = getPC();
+            o4(0xE24DD000); // sub    sp, sp, # <local variables>
             // We don't know how many local variables we are going to use,
             // but we will round the allocation up to a multiple of
             // STACK_ALIGNMENT, so it won't affect the stack alignment.
+            return pc;
         }
 
         virtual void functionExit(Type* pDecl, int localVariableAddress, int localVariableSize) {
@@ -810,7 +1046,9 @@
         }
 
         virtual int gjmp(int t) {
-            return o4(0xEA000000 | encodeAddress(t)); // b .L33
+            int pc = getPC();
+            o4(0xEA000000 | encodeAddress(t)); // b .L33
+            return pc;
         }
 
         /* l = 0: je, l == 1: jne */
@@ -841,7 +1079,9 @@
                     break;
             }
             int branch = l ? 0x1A000000 : 0x0A000000; // bne : beq
-            return o4(branch | encodeAddress(t));
+            int pc = getPC();
+            o4(branch | encodeAddress(t));
+            return pc;
         }
 
         virtual void gcmp(int op) {
@@ -1476,7 +1716,8 @@
 
             if (ea == 0) {
                 o4(0xEA000000); //        b .L99
-                result = o4(ea);         // .L1:   .word 0
+                result = getPC();
+                o4(ea);         // .L1:   .word 0
                             // .L99:
             }
             return result;
@@ -1566,7 +1807,9 @@
         }
 
         virtual int beginFunctionCallArguments() {
-            return o4(0xE24DDF00); // Placeholder
+            int pc = getPC();
+            o4(0xE24DDF00); // Placeholder sub  sp, sp, #0
+            return pc;
         }
 
         virtual size_t storeR0ToArg(int l, Type* pArgType) {
@@ -1652,6 +1895,7 @@
             if (l < 0 || l > 0x3FC) {
                 error("L out of range for stack adjustment: 0x%08x", l);
             }
+            flush();
             * (int*) a = 0xE24DDF00 | (l >> 2); // sub    sp, sp, #0 << 2
             mStackUse += mStackAlignmentAdjustment;
             LOG_STACK("endFunctionCallArguments mStackUse: %d, mStackAlignmentAdjustment %d\n",
@@ -1661,7 +1905,9 @@
         virtual int callForward(int symbol, Type* pFunc) {
             setR0Type(pFunc->pHead);
             // Forward calls are always short (local)
-            return o4(0xEB000000 | encodeAddress(symbol));
+            int pc = getPC();
+            o4(0xEB000000 | encodeAddress(symbol));
+            return pc;
         }
 
         virtual void callIndirect(int l, Type* pFunc) {
@@ -1975,32 +2221,10 @@
             }
         }
 
-        bool encode12BitImmediate(size_t immediate, size_t* pResult) {
-            for(size_t i = 0; i < 16; i++) {
-                size_t rotate = i * 2;
-                size_t mask = rotateRight(0xff, rotate);
-                if ((immediate | mask) == mask) {
-                    size_t bits8 = rotateLeft(immediate, rotate);
-                    assert(bits8 <= 0xff);
-                    *pResult = (i << 8) | bits8;
-                    return true;
-                }
-            }
-            return false;
-        }
-
         void incompatibleTypes(Type* pR0Type, Type* pType) {
             error("Incompatible types old: %d new: %d", pR0Type->tag, pType->tag);
         }
 
-        size_t rotateRight(size_t n, size_t rotate) {
-            return (n >> rotate) | (n << (32 - rotate));
-        }
-
-        size_t rotateLeft(size_t n, size_t rotate) {
-            return (n << rotate) | (n >> (32 - rotate));
-        }
-
         void callRuntime(void* fn) {
             o4(0xE59FC000); // ldr    r12, .L1
             o4(0xEA000000); // b      .L99
@@ -2910,7 +3134,7 @@
             delete mpBase;
         }
 
-        virtual void init(CodeBuf* pCodeBuf) {
+        virtual void init(ICodeBuf* pCodeBuf) {
             mpBase->init(pCodeBuf);
         }
 
@@ -3683,7 +3907,7 @@
     int mLineNumber;
     bool mbBumpLine;
 
-    CodeBuf codeBuf;
+    ICodeBuf* pCodeBuf;
     CodeGenerator* pGen;
 
     String mErrorBuf;
@@ -4722,13 +4946,13 @@
             next();
             skip('(');
             if (t == TOK_WHILE) {
-                n = codeBuf.getPC(); // top of loop, target of "next" iteration
+                n = pCodeBuf->getPC(); // top of loop, target of "next" iteration
                 a = test_expr();
             } else {
                 if (tok != ';')
                     commaExpr();
                 skip(';');
-                n = codeBuf.getPC();
+                n = pCodeBuf->getPC();
                 a = 0;
                 if (tok != ';')
                     a = test_expr();
@@ -4736,14 +4960,14 @@
                 if (tok != ')') {
                     t = pGen->gjmp(0);
                     commaExpr();
-                    pGen->gjmp(n - codeBuf.getPC() - pGen->jumpOffset());
+                    pGen->gjmp(n - pCodeBuf->getPC() - pGen->jumpOffset());
                     pGen->gsym(t);
                     n = t + 4;
                 }
             }
             skip(')');
             block((intptr_t) &a, false);
-            pGen->gjmp(n - codeBuf.getPC() - pGen->jumpOffset()); /* jmp */
+            pGen->gjmp(n - pCodeBuf->getPC() - pGen->jumpOffset()); /* jmp */
             pGen->gsym(a);
         } else if (tok == '{') {
             if (! outermostFunctionBlock) {
@@ -5469,7 +5693,7 @@
                         /* patch forward references */
                         pGen->resolveForward((int) name->pForward);
                         /* put function address */
-                        name->pAddress = (void*) codeBuf.getPC();
+                        name->pAddress = (void*) pCodeBuf->getPC();
                     }
                     // Calculate stack offsets for parameters
                     mLocals.pushLevel();
@@ -5536,6 +5760,10 @@
             delete pGen;
             pGen = 0;
         }
+        if (pCodeBuf) {
+            delete pCodeBuf;
+            pCodeBuf = 0;
+        }
         if (file) {
             delete file;
             file = 0;
@@ -5560,6 +5788,7 @@
         dch = 0;
         file = 0;
         pGlobalBase = 0;
+        pCodeBuf = 0;
         pGen = 0;
         mPragmaStringCount = 0;
         mCompileResult = 0;
@@ -5572,10 +5801,14 @@
         delete pGen;
         pGen = 0;
 
+        delete pCodeBuf;
+        pCodeBuf = new CodeBuf();
+
         if (architecture != NULL) {
 #ifdef PROVIDE_ARM_CODEGEN
             if (! pGen && strcmp(architecture, "arm") == 0) {
                 pGen = new ARMCodeGenerator();
+                pCodeBuf = new ARMCodeBuf(pCodeBuf);
             }
 #endif
 #ifdef PROVIDE_X86_CODEGEN
@@ -5591,6 +5824,7 @@
         if (pGen == NULL) {
 #if defined(DEFAULT_ARM_CODEGEN)
             pGen = new ARMCodeGenerator();
+            pCodeBuf = new ARMCodeBuf(pCodeBuf);
 #elif defined(DEFAULT_X86_CODEGEN)
             pGen = new X86CodeGenerator();
 #endif
@@ -5639,7 +5873,6 @@
         mLocals.setTokenTable(&mTokenTable);
 
         internKeywords();
-        codeBuf.init(ALLOC_SIZE);
         setArchitecture(NULL);
         if (!pGen) {
             return -1;
@@ -5647,8 +5880,12 @@
 #ifdef PROVIDE_TRACE_CODEGEN
             pGen = new TraceCodeGenerator(pGen);
 #endif
-            pGen->setErrorSink(this);
-        pGen->init(&codeBuf);
+        pGen->setErrorSink(this);
+
+        if (pCodeBuf) {
+            pCodeBuf->init(ALLOC_SIZE);
+        }
+        pGen->init(pCodeBuf);
         file = new TextInputStream(text, textLength);
         pGlobalBase = (char*) calloc(1, ALLOC_SIZE);
         glo = pGlobalBase;
@@ -5731,8 +5968,8 @@
     }
 
     void getProgramBinary(ACCvoid** base, ACCsizei* length) {
-        *base = codeBuf.getBase();
-        *length = (ACCsizei) codeBuf.getSize();
+        *base = pCodeBuf->getBase();
+        *length = (ACCsizei) pCodeBuf->getSize();
     }
 
     char* getErrorMessage() {