MIPS32: Fill branch delay slots

Test: booted MIPS32 in QEMU
Test: test-art-host-gtest
Test: test-art-target-gtest
Test: test-art-target-run-test-optimizing on CI20

Change-Id: I727e80753395ab99fff004cb5d2e0a06409150d7
diff --git a/compiler/utils/mips/assembler_mips.h b/compiler/utils/mips/assembler_mips.h
index 434ca67..d50c439 100644
--- a/compiler/utils/mips/assembler_mips.h
+++ b/compiler/utils/mips/assembler_mips.h
@@ -154,6 +154,9 @@
       : Assembler(arena),
         overwriting_(false),
         overwrite_location_(0),
+        reordering_(true),
+        ds_fsm_state_(kExpectingLabel),
+        ds_fsm_target_pc_(0),
         literals_(arena->Adapter(kArenaAllocAssembler)),
         last_position_adjustment_(0),
         last_old_position_(0),
@@ -163,6 +166,7 @@
   }
 
   size_t CodeSize() const OVERRIDE { return Assembler::CodeSize(); }
+  size_t CodePosition() OVERRIDE;
   DebugFrameOpCodeWriterForAssembler& cfi() { return Assembler::cfi(); }
 
   virtual ~MipsAssembler() {
@@ -256,6 +260,11 @@
   void Slti(Register rt, Register rs, uint16_t imm16);
   void Sltiu(Register rt, Register rs, uint16_t imm16);
 
+  // Branches and jumps to immediate offsets/addresses do not take care of their
+  // delay/forbidden slots and generally should not be used directly. This applies
+  // to the following R2 and R6 branch/jump instructions with imm16, imm21, addr26
+  // offsets/addresses.
+  // Use branches/jumps to labels instead.
   void B(uint16_t imm16);
   void Bal(uint16_t imm16);
   void Beq(Register rs, Register rt, uint16_t imm16);
@@ -272,9 +281,13 @@
   void Bc1t(int cc, uint16_t imm16);  // R2
   void J(uint32_t addr26);
   void Jal(uint32_t addr26);
+  // Jalr() and Jr() fill their delay slots when reordering is enabled.
+  // When reordering is disabled, the delay slots must be filled manually.
+  // You may use NopIfNoReordering() to fill them when reordering is disabled.
   void Jalr(Register rd, Register rs);
   void Jalr(Register rs);
   void Jr(Register rs);
+  // Nal() does not fill its delay slot. It must be filled manually.
   void Nal();
   void Auipc(Register rs, uint16_t imm16);  // R6
   void Addiupc(Register rs, uint32_t imm19);  // R6
@@ -403,6 +416,7 @@
 
   void Break();
   void Nop();
+  void NopIfNoReordering();
   void Move(Register rd, Register rs);
   void Clear(Register rd);
   void Not(Register rd, Register rs);
@@ -414,7 +428,8 @@
   void LoadSConst32(FRegister r, int32_t value, Register temp);
   void Addiu32(Register rt, Register rs, int32_t value, Register rtmp = AT);
 
-  // These will generate R2 branches or R6 branches as appropriate.
+  // These will generate R2 branches or R6 branches as appropriate and take care of
+  // the delay/forbidden slots.
   void Bind(MipsLabel* label);
   void B(MipsLabel* label);
   void Bal(MipsLabel* label);
@@ -868,7 +883,51 @@
   };
   friend std::ostream& operator<<(std::ostream& os, const BranchCondition& rhs);
 
+  // Enables or disables instruction reordering (IOW, automatic filling of delay slots)
+  // similarly to ".set reorder" / ".set noreorder" in traditional MIPS assembly.
+  // Returns the last state, which may be useful for temporary enabling/disabling of
+  // reordering.
+  bool SetReorder(bool enable);
+
  private:
+  // Description of the last instruction in terms of input and output registers.
+  // Used to make the decision of moving the instruction into a delay slot.
+  struct DelaySlot {
+    DelaySlot();
+    // Encoded instruction that may be used to fill the delay slot or 0
+    // (0 conveniently represents NOP).
+    uint32_t instruction_;
+    // Mask of output GPRs for the instruction.
+    uint32_t gpr_outs_mask_;
+    // Mask of input GPRs for the instruction.
+    uint32_t gpr_ins_mask_;
+    // Mask of output FPRs for the instruction.
+    uint32_t fpr_outs_mask_;
+    // Mask of input FPRs for the instruction.
+    uint32_t fpr_ins_mask_;
+    // Mask of output FPU condition code flags for the instruction.
+    uint32_t cc_outs_mask_;
+    // Mask of input FPU condition code flags for the instruction.
+    uint32_t cc_ins_mask_;
+    // Branches never operate on the LO and HI registers, hence there's
+    // no mask for LO and HI.
+  };
+
+  // Delay slot finite state machine's (DS FSM's) state. The FSM state is updated
+  // upon every new instruction and label generated. The FSM detects instructions
+  // suitable for delay slots and immediately preceded with labels. These are target
+  // instructions for branches. If an unconditional R2 branch does not get its delay
+  // slot filled with the immediately preceding instruction, it may instead get the
+  // slot filled with the target instruction (the branch will need its offset
+  // incremented past the target instruction). We call this "absorption". The FSM
+  // records PCs of the target instructions suitable for this optimization.
+  enum DsFsmState {
+    kExpectingLabel,
+    kExpectingInstruction,
+    kExpectingCommit
+  };
+  friend std::ostream& operator<<(std::ostream& os, const DsFsmState& rhs);
+
   class Branch {
    public:
     enum Type {
@@ -910,6 +969,17 @@
     static constexpr uint32_t kUnresolved = 0xffffffff;  // Unresolved target_
     static constexpr int32_t kMaxBranchLength = 32;
     static constexpr int32_t kMaxBranchSize = kMaxBranchLength * sizeof(uint32_t);
+    // The following two instruction encodings can never legally occur in branch delay
+    // slots and are used as markers.
+    //
+    // kUnfilledDelaySlot means that the branch may use either the preceding or the target
+    // instruction to fill its delay slot (the latter is only possible with unconditional
+    // R2 branches and is termed here as "absorption").
+    static constexpr uint32_t kUnfilledDelaySlot = 0x10000000;  // beq zero, zero, 0.
+    // kUnfillableDelaySlot means that the branch cannot use an instruction (other than NOP)
+    // to fill its delay slot. This is only used for unconditional R2 branches to prevent
+    // absorption of the target instruction when reordering is disabled.
+    static constexpr uint32_t kUnfillableDelaySlot = 0x13FF0000;  // beq ra, ra, 0.
 
     struct BranchInfo {
       // Branch length as a number of 4-byte-long instructions.
@@ -958,6 +1028,8 @@
     uint32_t GetTarget() const;
     uint32_t GetLocation() const;
     uint32_t GetOldLocation() const;
+    uint32_t GetPrecedingInstructionLength(Type type) const;
+    uint32_t GetPrecedingInstructionSize(Type type) const;
     uint32_t GetLength() const;
     uint32_t GetOldLength() const;
     uint32_t GetSize() const;
@@ -967,6 +1039,12 @@
     bool IsLong() const;
     bool IsResolved() const;
 
+    // Various helpers for branch delay slot management.
+    bool CanHaveDelayedInstruction(const DelaySlot& delay_slot) const;
+    void SetDelayedInstruction(uint32_t instruction);
+    uint32_t GetDelayedInstruction() const;
+    void DecrementLocations();
+
     // Returns the bit size of the signed offset that the branch instruction can handle.
     OffsetBits GetOffsetSize() const;
 
@@ -1031,27 +1109,34 @@
     // Helper for the above.
     void InitShortOrLong(OffsetBits ofs_size, Type short_type, Type long_type);
 
-    uint32_t old_location_;      // Offset into assembler buffer in bytes.
-    uint32_t location_;          // Offset into assembler buffer in bytes.
-    uint32_t target_;            // Offset into assembler buffer in bytes.
+    uint32_t old_location_;         // Offset into assembler buffer in bytes.
+    uint32_t location_;             // Offset into assembler buffer in bytes.
+    uint32_t target_;               // Offset into assembler buffer in bytes.
 
-    uint32_t lhs_reg_;           // Left-hand side register in conditional branches or
-                                 // indirect call register.
-    uint32_t rhs_reg_;           // Right-hand side register in conditional branches.
-    BranchCondition condition_;  // Condition for conditional branches.
+    uint32_t lhs_reg_;              // Left-hand side register in conditional branches or
+                                    // FPU condition code. Destination register in literals.
+    uint32_t rhs_reg_;              // Right-hand side register in conditional branches.
+                                    // Base register in literals (ZERO on R6).
+    BranchCondition condition_;     // Condition for conditional branches.
 
-    Type type_;                  // Current type of the branch.
-    Type old_type_;              // Initial type of the branch.
+    Type type_;                     // Current type of the branch.
+    Type old_type_;                 // Initial type of the branch.
+
+    uint32_t delayed_instruction_;  // Encoded instruction for the delay slot or
+                                    // kUnfilledDelaySlot if none but fillable or
+                                    // kUnfillableDelaySlot if none and unfillable
+                                    // (the latter is only used for unconditional R2
+                                    // branches).
   };
   friend std::ostream& operator<<(std::ostream& os, const Branch::Type& rhs);
   friend std::ostream& operator<<(std::ostream& os, const Branch::OffsetBits& rhs);
 
-  void EmitR(int opcode, Register rs, Register rt, Register rd, int shamt, int funct);
-  void EmitI(int opcode, Register rs, Register rt, uint16_t imm);
-  void EmitI21(int opcode, Register rs, uint32_t imm21);
-  void EmitI26(int opcode, uint32_t imm26);
-  void EmitFR(int opcode, int fmt, FRegister ft, FRegister fs, FRegister fd, int funct);
-  void EmitFI(int opcode, int fmt, FRegister rt, uint16_t imm);
+  uint32_t EmitR(int opcode, Register rs, Register rt, Register rd, int shamt, int funct);
+  uint32_t EmitI(int opcode, Register rs, Register rt, uint16_t imm);
+  uint32_t EmitI21(int opcode, Register rs, uint32_t imm21);
+  uint32_t EmitI26(int opcode, uint32_t imm26);
+  uint32_t EmitFR(int opcode, int fmt, FRegister ft, FRegister fs, FRegister fd, int funct);
+  uint32_t EmitFI(int opcode, int fmt, FRegister rt, uint16_t imm);
   void EmitBcondR2(BranchCondition cond, Register rs, Register rt, uint16_t imm16);
   void EmitBcondR6(BranchCondition cond, Register rs, Register rt, uint32_t imm16_21);
 
@@ -1060,6 +1145,33 @@
   void Call(MipsLabel* label);
   void FinalizeLabeledBranch(MipsLabel* label);
 
+  // Various helpers for branch delay slot management.
+  void DsFsmInstr(uint32_t instruction,
+                  uint32_t gpr_outs_mask,
+                  uint32_t gpr_ins_mask,
+                  uint32_t fpr_outs_mask,
+                  uint32_t fpr_ins_mask,
+                  uint32_t cc_outs_mask,
+                  uint32_t cc_ins_mask);
+  void DsFsmInstrNop(uint32_t instruction);
+  void DsFsmInstrRrr(uint32_t instruction, Register out, Register in1, Register in2);
+  void DsFsmInstrRrrr(uint32_t instruction, Register in1_out, Register in2, Register in3);
+  void DsFsmInstrFff(uint32_t instruction, FRegister out, FRegister in1, FRegister in2);
+  void DsFsmInstrFfff(uint32_t instruction, FRegister in1_out, FRegister in2, FRegister in3);
+  void DsFsmInstrRf(uint32_t instruction, Register out, FRegister in);
+  void DsFsmInstrFr(uint32_t instruction, FRegister out, Register in);
+  void DsFsmInstrFR(uint32_t instruction, FRegister in1, Register in2);
+  void DsFsmInstrCff(uint32_t instruction, int cc_out, FRegister in1, FRegister in2);
+  void DsFsmInstrRrrc(uint32_t instruction, Register in1_out, Register in2, int cc_in);
+  void DsFsmInstrFffc(uint32_t instruction, FRegister in1_out, FRegister in2, int cc_in);
+  void DsFsmLabel();
+  void DsFsmCommitLabel();
+  void DsFsmDropLabel();
+  void MoveInstructionToDelaySlot(Branch& branch);
+  bool CanExchangeWithSlt(Register rs, Register rt) const;
+  void ExchangeWithSlt(const DelaySlot& forwarded_slot);
+  void GenerateSltForCondBranch(bool unsigned_slt, Register rs, Register rt);
+
   Branch* GetBranch(uint32_t branch_id);
   const Branch* GetBranch(uint32_t branch_id) const;
   uint32_t GetBranchLocationOrPcRelBase(const MipsAssembler::Branch* branch) const;
@@ -1100,6 +1212,17 @@
   // The current overwrite location.
   uint32_t overwrite_location_;
 
+  // Whether instruction reordering (IOW, automatic filling of delay slots) is enabled.
+  bool reordering_;
+  // Information about the last instruction that may be used to fill a branch delay slot.
+  DelaySlot delay_slot_;
+  // Delay slot FSM state.
+  DsFsmState ds_fsm_state_;
+  // PC of the current labeled target instruction.
+  uint32_t ds_fsm_target_pc_;
+  // PCs of labeled target instructions.
+  std::vector<uint32_t> ds_fsm_target_pcs_;
+
   // Use std::deque<> for literal labels to allow insertions at the end
   // without invalidating pointers and references to existing elements.
   ArenaDeque<Literal> literals_;
@@ -1109,7 +1232,7 @@
   // that PC (from NAL) points to.
   MipsLabel pc_rel_base_label_;
 
-  // Data for AdjustedPosition(), see the description there.
+  // Data for GetAdjustedPosition(), see the description there.
   uint32_t last_position_adjustment_;
   uint32_t last_old_position_;
   uint32_t last_branch_id_;