MIPS32: Fill branch delay slots
Test: booted MIPS32 in QEMU
Test: test-art-host-gtest
Test: test-art-target-gtest
Test: test-art-target-run-test-optimizing on CI20
Change-Id: I727e80753395ab99fff004cb5d2e0a06409150d7
diff --git a/compiler/utils/mips/assembler_mips.h b/compiler/utils/mips/assembler_mips.h
index 434ca67..d50c439 100644
--- a/compiler/utils/mips/assembler_mips.h
+++ b/compiler/utils/mips/assembler_mips.h
@@ -154,6 +154,9 @@
: Assembler(arena),
overwriting_(false),
overwrite_location_(0),
+ reordering_(true),
+ ds_fsm_state_(kExpectingLabel),
+ ds_fsm_target_pc_(0),
literals_(arena->Adapter(kArenaAllocAssembler)),
last_position_adjustment_(0),
last_old_position_(0),
@@ -163,6 +166,7 @@
}
size_t CodeSize() const OVERRIDE { return Assembler::CodeSize(); }
+ size_t CodePosition() OVERRIDE;
DebugFrameOpCodeWriterForAssembler& cfi() { return Assembler::cfi(); }
virtual ~MipsAssembler() {
@@ -256,6 +260,11 @@
void Slti(Register rt, Register rs, uint16_t imm16);
void Sltiu(Register rt, Register rs, uint16_t imm16);
+ // Branches and jumps to immediate offsets/addresses do not take care of their
+ // delay/forbidden slots and generally should not be used directly. This applies
+ // to the following R2 and R6 branch/jump instructions with imm16, imm21, addr26
+ // offsets/addresses.
+ // Use branches/jumps to labels instead.
void B(uint16_t imm16);
void Bal(uint16_t imm16);
void Beq(Register rs, Register rt, uint16_t imm16);
@@ -272,9 +281,13 @@
void Bc1t(int cc, uint16_t imm16); // R2
void J(uint32_t addr26);
void Jal(uint32_t addr26);
+ // Jalr() and Jr() fill their delay slots when reordering is enabled.
+ // When reordering is disabled, the delay slots must be filled manually.
+ // You may use NopIfNoReordering() to fill them when reordering is disabled.
void Jalr(Register rd, Register rs);
void Jalr(Register rs);
void Jr(Register rs);
+ // Nal() does not fill its delay slot. It must be filled manually.
void Nal();
void Auipc(Register rs, uint16_t imm16); // R6
void Addiupc(Register rs, uint32_t imm19); // R6
@@ -403,6 +416,7 @@
void Break();
void Nop();
+ void NopIfNoReordering();
void Move(Register rd, Register rs);
void Clear(Register rd);
void Not(Register rd, Register rs);
@@ -414,7 +428,8 @@
void LoadSConst32(FRegister r, int32_t value, Register temp);
void Addiu32(Register rt, Register rs, int32_t value, Register rtmp = AT);
- // These will generate R2 branches or R6 branches as appropriate.
+ // These will generate R2 branches or R6 branches as appropriate and take care of
+ // the delay/forbidden slots.
void Bind(MipsLabel* label);
void B(MipsLabel* label);
void Bal(MipsLabel* label);
@@ -868,7 +883,51 @@
};
friend std::ostream& operator<<(std::ostream& os, const BranchCondition& rhs);
+ // Enables or disables instruction reordering (IOW, automatic filling of delay slots)
+ // similarly to ".set reorder" / ".set noreorder" in traditional MIPS assembly.
+ // Returns the last state, which may be useful for temporary enabling/disabling of
+ // reordering.
+ bool SetReorder(bool enable);
+
private:
+ // Description of the last instruction in terms of input and output registers.
+ // Used to make the decision of moving the instruction into a delay slot.
+ struct DelaySlot {
+ DelaySlot();
+ // Encoded instruction that may be used to fill the delay slot or 0
+ // (0 conveniently represents NOP).
+ uint32_t instruction_;
+ // Mask of output GPRs for the instruction.
+ uint32_t gpr_outs_mask_;
+ // Mask of input GPRs for the instruction.
+ uint32_t gpr_ins_mask_;
+ // Mask of output FPRs for the instruction.
+ uint32_t fpr_outs_mask_;
+ // Mask of input FPRs for the instruction.
+ uint32_t fpr_ins_mask_;
+ // Mask of output FPU condition code flags for the instruction.
+ uint32_t cc_outs_mask_;
+ // Mask of input FPU condition code flags for the instruction.
+ uint32_t cc_ins_mask_;
+ // Branches never operate on the LO and HI registers, hence there's
+ // no mask for LO and HI.
+ };
+
+ // Delay slot finite state machine's (DS FSM's) state. The FSM state is updated
+ // upon every new instruction and label generated. The FSM detects instructions
+ // suitable for delay slots and immediately preceded with labels. These are target
+ // instructions for branches. If an unconditional R2 branch does not get its delay
+ // slot filled with the immediately preceding instruction, it may instead get the
+ // slot filled with the target instruction (the branch will need its offset
+ // incremented past the target instruction). We call this "absorption". The FSM
+ // records PCs of the target instructions suitable for this optimization.
+ enum DsFsmState {
+ kExpectingLabel,
+ kExpectingInstruction,
+ kExpectingCommit
+ };
+ friend std::ostream& operator<<(std::ostream& os, const DsFsmState& rhs);
+
class Branch {
public:
enum Type {
@@ -910,6 +969,17 @@
static constexpr uint32_t kUnresolved = 0xffffffff; // Unresolved target_
static constexpr int32_t kMaxBranchLength = 32;
static constexpr int32_t kMaxBranchSize = kMaxBranchLength * sizeof(uint32_t);
+ // The following two instruction encodings can never legally occur in branch delay
+ // slots and are used as markers.
+ //
+ // kUnfilledDelaySlot means that the branch may use either the preceding or the target
+ // instruction to fill its delay slot (the latter is only possible with unconditional
+ // R2 branches and is termed here as "absorption").
+ static constexpr uint32_t kUnfilledDelaySlot = 0x10000000; // beq zero, zero, 0.
+ // kUnfillableDelaySlot means that the branch cannot use an instruction (other than NOP)
+ // to fill its delay slot. This is only used for unconditional R2 branches to prevent
+ // absorption of the target instruction when reordering is disabled.
+ static constexpr uint32_t kUnfillableDelaySlot = 0x13FF0000; // beq ra, ra, 0.
struct BranchInfo {
// Branch length as a number of 4-byte-long instructions.
@@ -958,6 +1028,8 @@
uint32_t GetTarget() const;
uint32_t GetLocation() const;
uint32_t GetOldLocation() const;
+ uint32_t GetPrecedingInstructionLength(Type type) const;
+ uint32_t GetPrecedingInstructionSize(Type type) const;
uint32_t GetLength() const;
uint32_t GetOldLength() const;
uint32_t GetSize() const;
@@ -967,6 +1039,12 @@
bool IsLong() const;
bool IsResolved() const;
+ // Various helpers for branch delay slot management.
+ bool CanHaveDelayedInstruction(const DelaySlot& delay_slot) const;
+ void SetDelayedInstruction(uint32_t instruction);
+ uint32_t GetDelayedInstruction() const;
+ void DecrementLocations();
+
// Returns the bit size of the signed offset that the branch instruction can handle.
OffsetBits GetOffsetSize() const;
@@ -1031,27 +1109,34 @@
// Helper for the above.
void InitShortOrLong(OffsetBits ofs_size, Type short_type, Type long_type);
- uint32_t old_location_; // Offset into assembler buffer in bytes.
- uint32_t location_; // Offset into assembler buffer in bytes.
- uint32_t target_; // Offset into assembler buffer in bytes.
+ uint32_t old_location_; // Offset into assembler buffer in bytes.
+ uint32_t location_; // Offset into assembler buffer in bytes.
+ uint32_t target_; // Offset into assembler buffer in bytes.
- uint32_t lhs_reg_; // Left-hand side register in conditional branches or
- // indirect call register.
- uint32_t rhs_reg_; // Right-hand side register in conditional branches.
- BranchCondition condition_; // Condition for conditional branches.
+ uint32_t lhs_reg_; // Left-hand side register in conditional branches or
+ // FPU condition code. Destination register in literals.
+ uint32_t rhs_reg_; // Right-hand side register in conditional branches.
+ // Base register in literals (ZERO on R6).
+ BranchCondition condition_; // Condition for conditional branches.
- Type type_; // Current type of the branch.
- Type old_type_; // Initial type of the branch.
+ Type type_; // Current type of the branch.
+ Type old_type_; // Initial type of the branch.
+
+ uint32_t delayed_instruction_; // Encoded instruction for the delay slot or
+ // kUnfilledDelaySlot if none but fillable or
+ // kUnfillableDelaySlot if none and unfillable
+ // (the latter is only used for unconditional R2
+ // branches).
};
friend std::ostream& operator<<(std::ostream& os, const Branch::Type& rhs);
friend std::ostream& operator<<(std::ostream& os, const Branch::OffsetBits& rhs);
- void EmitR(int opcode, Register rs, Register rt, Register rd, int shamt, int funct);
- void EmitI(int opcode, Register rs, Register rt, uint16_t imm);
- void EmitI21(int opcode, Register rs, uint32_t imm21);
- void EmitI26(int opcode, uint32_t imm26);
- void EmitFR(int opcode, int fmt, FRegister ft, FRegister fs, FRegister fd, int funct);
- void EmitFI(int opcode, int fmt, FRegister rt, uint16_t imm);
+ uint32_t EmitR(int opcode, Register rs, Register rt, Register rd, int shamt, int funct);
+ uint32_t EmitI(int opcode, Register rs, Register rt, uint16_t imm);
+ uint32_t EmitI21(int opcode, Register rs, uint32_t imm21);
+ uint32_t EmitI26(int opcode, uint32_t imm26);
+ uint32_t EmitFR(int opcode, int fmt, FRegister ft, FRegister fs, FRegister fd, int funct);
+ uint32_t EmitFI(int opcode, int fmt, FRegister rt, uint16_t imm);
void EmitBcondR2(BranchCondition cond, Register rs, Register rt, uint16_t imm16);
void EmitBcondR6(BranchCondition cond, Register rs, Register rt, uint32_t imm16_21);
@@ -1060,6 +1145,33 @@
void Call(MipsLabel* label);
void FinalizeLabeledBranch(MipsLabel* label);
+ // Various helpers for branch delay slot management.
+ void DsFsmInstr(uint32_t instruction,
+ uint32_t gpr_outs_mask,
+ uint32_t gpr_ins_mask,
+ uint32_t fpr_outs_mask,
+ uint32_t fpr_ins_mask,
+ uint32_t cc_outs_mask,
+ uint32_t cc_ins_mask);
+ void DsFsmInstrNop(uint32_t instruction);
+ void DsFsmInstrRrr(uint32_t instruction, Register out, Register in1, Register in2);
+ void DsFsmInstrRrrr(uint32_t instruction, Register in1_out, Register in2, Register in3);
+ void DsFsmInstrFff(uint32_t instruction, FRegister out, FRegister in1, FRegister in2);
+ void DsFsmInstrFfff(uint32_t instruction, FRegister in1_out, FRegister in2, FRegister in3);
+ void DsFsmInstrRf(uint32_t instruction, Register out, FRegister in);
+ void DsFsmInstrFr(uint32_t instruction, FRegister out, Register in);
+ void DsFsmInstrFR(uint32_t instruction, FRegister in1, Register in2);
+ void DsFsmInstrCff(uint32_t instruction, int cc_out, FRegister in1, FRegister in2);
+ void DsFsmInstrRrrc(uint32_t instruction, Register in1_out, Register in2, int cc_in);
+ void DsFsmInstrFffc(uint32_t instruction, FRegister in1_out, FRegister in2, int cc_in);
+ void DsFsmLabel();
+ void DsFsmCommitLabel();
+ void DsFsmDropLabel();
+ void MoveInstructionToDelaySlot(Branch& branch);
+ bool CanExchangeWithSlt(Register rs, Register rt) const;
+ void ExchangeWithSlt(const DelaySlot& forwarded_slot);
+ void GenerateSltForCondBranch(bool unsigned_slt, Register rs, Register rt);
+
Branch* GetBranch(uint32_t branch_id);
const Branch* GetBranch(uint32_t branch_id) const;
uint32_t GetBranchLocationOrPcRelBase(const MipsAssembler::Branch* branch) const;
@@ -1100,6 +1212,17 @@
// The current overwrite location.
uint32_t overwrite_location_;
+ // Whether instruction reordering (IOW, automatic filling of delay slots) is enabled.
+ bool reordering_;
+ // Information about the last instruction that may be used to fill a branch delay slot.
+ DelaySlot delay_slot_;
+ // Delay slot FSM state.
+ DsFsmState ds_fsm_state_;
+ // PC of the current labeled target instruction.
+ uint32_t ds_fsm_target_pc_;
+ // PCs of labeled target instructions.
+ std::vector<uint32_t> ds_fsm_target_pcs_;
+
// Use std::deque<> for literal labels to allow insertions at the end
// without invalidating pointers and references to existing elements.
ArenaDeque<Literal> literals_;
@@ -1109,7 +1232,7 @@
// that PC (from NAL) points to.
MipsLabel pc_rel_base_label_;
- // Data for AdjustedPosition(), see the description there.
+ // Data for GetAdjustedPosition(), see the description there.
uint32_t last_position_adjustment_;
uint32_t last_old_position_;
uint32_t last_branch_id_;