Compiler constant handling rework

In preparation for de-optimization, reworked the constant
handling mechanism.  Also took advantage of knowledge of
constant operands (particularly for long operations).
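
To illustrate the idea (this is not the ART code itself; Operand,
FitsInModifiedImmediate and FlipComparison below are invented
stand-ins), here is a minimal sketch of the normalization the diff
applies in GenFusedLongCmpBranch and GenArithImmOpLong: swap operands
so any constant lands in src2, flip the comparison sense, and take an
immediate code path only when both halves of the 64-bit constant are
encodable as ARM modified immediates.

  // Hypothetical, simplified sketch; names are illustrative only.
  #include <cstdint>
  #include <utility>

  enum Condition { kEq, kNe, kLt, kLe, kGt, kGe };

  struct Operand {
    bool is_const;
    int64_t value;  // Only meaningful when is_const is true.
  };

  // Stand-in for ARM's ModifiedImmediate() test: can this 32-bit value
  // be encoded directly in a Thumb2 data-processing instruction?
  static bool FitsInModifiedImmediate(uint32_t v) {
    return v <= 0xff;  // Deliberately conservative placeholder.
  }

  // Swapping the operands of a comparison requires flipping its sense.
  static Condition FlipComparison(Condition cc) {
    switch (cc) {
      case kLt: return kGt;
      case kLe: return kGe;
      case kGt: return kLt;
      case kGe: return kLe;
      default:  return cc;  // Eq/Ne are symmetric.
    }
  }

  // Normalize so that if either operand is constant, it ends up in
  // src2, then decide whether an immediate-form path may be used.
  static bool UseImmediateForm(Operand& src1, Operand& src2, Condition& cc) {
    if (src1.is_const) {
      std::swap(src1, src2);
      cc = FlipComparison(cc);
    }
    if (!src2.is_const) return false;
    uint32_t lo = static_cast<uint32_t>(src2.value);
    uint32_t hi = static_cast<uint32_t>(src2.value >> 32);
    return FitsInModifiedImmediate(lo) && FitsInModifiedImmediate(hi);
  }

  int main() {
    Operand a{true, 42};  // Constant on the left...
    Operand b{false, 0};
    Condition cc = kLt;
    // ...gets swapped to the right; cc becomes kGt and the immediate
    // form is usable because both halves of 42 encode directly.
    bool imm = UseImmediateForm(a, b, cc);
    return imm ? 0 : 1;
  }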

Significant performance improvements for Mandelbrot
(~60 seconds to ~34 seconds).  Minor improvements in other
benchmarks.

The new constant handling breaks two of the existing
optimization passes: "Skip Large Method" and "Load/Store
Elimination."

I don't intend to update the large method optimization
because it will be superseded by the upcoming interpreter/
fingerprinting mechanism.  Leaving the code in place for
now in order to compare compile-time improvements with
fingerprinting/interpret.  All related code will be deleted
when that is complete.

The load/store elimination pass needs some rework to handle
uses of multiple-register loads and stores.  It will be
updated & restored in a future CL.

Change-Id: Ia979abaf51b8ae81bbb0428031cbcea854625fac
diff --git a/src/compiler/codegen/arm/int_arm.cc b/src/compiler/codegen/arm/int_arm.cc
index fcf74f1..5a9786c 100644
--- a/src/compiler/codegen/arm/int_arm.cc
+++ b/src/compiler/codegen/arm/int_arm.cc
@@ -121,16 +121,81 @@
   branch3->target = branch1->target;
 }
 
-void ArmCodegen::GenFusedLongCmpBranch(CompilationUnit* cu, BasicBlock* bb, MIR* mir)
+void ArmCodegen::GenFusedLongCmpImmBranch(CompilationUnit* cu, BasicBlock* bb, RegLocation rl_src1,
+                                          int64_t val, ConditionCode ccode)
 {
+  int32_t val_lo = Low32Bits(val);
+  int32_t val_hi = High32Bits(val);
+  DCHECK(ModifiedImmediate(val_lo) >= 0);
+  DCHECK(ModifiedImmediate(val_hi) >= 0);
   LIR* label_list = cu->block_label_list;
   LIR* taken = &label_list[bb->taken->id];
   LIR* not_taken = &label_list[bb->fall_through->id];
+  rl_src1 = LoadValueWide(cu, rl_src1, kCoreReg);
+  int32_t low_reg = rl_src1.low_reg;
+  int32_t high_reg = rl_src1.high_reg;
+
+  switch(ccode) {
+    case kCondEq:
+      OpCmpImmBranch(cu, kCondNe, high_reg, val_hi, not_taken);
+      break;
+    case kCondNe:
+      OpCmpImmBranch(cu, kCondNe, high_reg, val_hi, taken);
+      break;
+    case kCondLt:
+      OpCmpImmBranch(cu, kCondLt, high_reg, val_hi, taken);
+      OpCmpImmBranch(cu, kCondGt, high_reg, val_hi, not_taken);
+      ccode = kCondCc;
+      break;
+    case kCondLe:
+      OpCmpImmBranch(cu, kCondLt, high_reg, val_hi, taken);
+      OpCmpImmBranch(cu, kCondGt, high_reg, val_hi, not_taken);
+      ccode = kCondLs;
+      break;
+    case kCondGt:
+      OpCmpImmBranch(cu, kCondGt, high_reg, val_hi, taken);
+      OpCmpImmBranch(cu, kCondLt, high_reg, val_hi, not_taken);
+      ccode = kCondHi;
+      break;
+    case kCondGe:
+      OpCmpImmBranch(cu, kCondGt, high_reg, val_hi, taken);
+      OpCmpImmBranch(cu, kCondLt, high_reg, val_hi, not_taken);
+      ccode = kCondCs;
+      break;
+    default:
+      LOG(FATAL) << "Unexpected ccode: " << ccode;
+  }
+  OpCmpImmBranch(cu, ccode, low_reg, val_lo, taken);
+}
+
+
+void ArmCodegen::GenFusedLongCmpBranch(CompilationUnit* cu, BasicBlock* bb, MIR* mir)
+{
   RegLocation rl_src1 = GetSrcWide(cu, mir, 0);
   RegLocation rl_src2 = GetSrcWide(cu, mir, 2);
+  // Normalize such that if either operand is constant, src2 will be constant.
+  ConditionCode ccode = static_cast<ConditionCode>(mir->dalvikInsn.arg[0]);
+  if (rl_src1.is_const) {
+    RegLocation rl_temp = rl_src1;
+    rl_src1 = rl_src2;
+    rl_src2 = rl_temp;
+    ccode = FlipComparisonOrder(ccode);
+  }
+  if (rl_src2.is_const) {
+    RegLocation rl_temp = UpdateLocWide(cu, rl_src2);
+    // Do special compare/branch against simple const operand if not already in registers.
+    int64_t val = ConstantValueWide(cu, rl_src2);
+    if ((rl_temp.location != kLocPhysReg) &&
+        ((ModifiedImmediate(Low32Bits(val)) >= 0) && (ModifiedImmediate(High32Bits(val)) >= 0))) {
+      GenFusedLongCmpImmBranch(cu, bb, rl_src1, val, ccode);
+      return;
+    }
+  }
+  LIR* label_list = cu->block_label_list;
+  LIR* taken = &label_list[bb->taken->id];
+  LIR* not_taken = &label_list[bb->fall_through->id];
   rl_src1 = LoadValueWide(cu, rl_src1, kCoreReg);
   rl_src2 = LoadValueWide(cu, rl_src2, kCoreReg);
-  ConditionCode ccode = static_cast<ConditionCode>(mir->dalvikInsn.arg[0]);
   OpRegReg(cu, kOpCmp, rl_src1.high_reg, rl_src2.high_reg);
   switch(ccode) {
     case kCondEq:
@@ -185,7 +250,7 @@
     if (ARM_LOWREG(reg) && ((check_value & 0xff) == check_value)) {
       NewLIR2(cu, kThumbCmpRI8, reg, check_value);
     } else if (mod_imm >= 0) {
-      NewLIR2(cu, kThumb2CmpRI8, reg, mod_imm);
+      NewLIR2(cu, kThumb2CmpRI12, reg, mod_imm);
     } else {
       int t_reg = AllocTemp(cu);
       LoadConstant(cu, t_reg, check_value);
@@ -523,6 +588,93 @@
   return false;
 }
 
+
+ /*
+  * Check to see if a result pair has a misaligned overlap with an operand pair.  This
+  * is not usual for dx to generate, but it is legal (for now).  In a future rev of
+  * dex, we'll want to make this case illegal.
+  */
+static bool BadOverlap(CompilationUnit* cu, RegLocation rl_src, RegLocation rl_dest)
+{
+  DCHECK(rl_src.wide);
+  DCHECK(rl_dest.wide);
+  return (abs(SRegToVReg(cu, rl_src.s_reg_low) - SRegToVReg(cu, rl_dest.s_reg_low)) == 1);
+}
+
+void ArmCodegen::GenMulLong(CompilationUnit* cu, RegLocation rl_dest, RegLocation rl_src1,
+                            RegLocation rl_src2)
+{
+    /*
+     * To pull off inline multiply, we have a worst-case requirement of 8 temporary
+     * registers.  Normally for Arm, we get 5.  We can get to 6 by including
+     * lr in the temp set.  The only problematic case is all operands and result are
+     * distinct, and none have been promoted.  In that case, we can succeed by aggressively
+     * freeing operand temp registers after they are no longer needed.  All other cases
+     * can proceed normally.  We'll just punt on the case of the result having a misaligned
+     * overlap with either operand and send that case to a runtime handler.
+     */
+    RegLocation rl_result;
+    if (BadOverlap(cu, rl_src1, rl_dest) || (BadOverlap(cu, rl_src2, rl_dest))) {
+      int func_offset = ENTRYPOINT_OFFSET(pLmul);
+      FlushAllRegs(cu);
+      CallRuntimeHelperRegLocationRegLocation(cu, func_offset, rl_src1, rl_src2, false);
+      rl_result = GetReturnWide(cu, false);
+      StoreValueWide(cu, rl_dest, rl_result);
+      return;
+    }
+    // Temporarily add LR to the temp pool, and assign it to tmp1
+    MarkTemp(cu, rARM_LR);
+    FreeTemp(cu, rARM_LR);
+    int tmp1 = rARM_LR;
+    LockTemp(cu, rARM_LR);
+
+    rl_src1 = LoadValueWide(cu, rl_src1, kCoreReg);
+    rl_src2 = LoadValueWide(cu, rl_src2, kCoreReg);
+
+    bool special_case = true;
+    // If operands are the same, or any pair has been promoted, we're not the special case.
+    if ((rl_src1.s_reg_low == rl_src2.s_reg_low) ||
+        (!IsTemp(cu, rl_src1.low_reg) && !IsTemp(cu, rl_src1.high_reg)) ||
+        (!IsTemp(cu, rl_src2.low_reg) && !IsTemp(cu, rl_src2.high_reg))) {
+      special_case = false;
+    }
+    // Tuning: if rl_dest has been promoted and is *not* either operand, could use directly.
+    int res_lo = AllocTemp(cu);
+    int res_hi;
+    if (rl_src1.low_reg == rl_src2.low_reg) {
+      res_hi = AllocTemp(cu);
+      NewLIR3(cu, kThumb2MulRRR, tmp1, rl_src1.low_reg, rl_src1.high_reg);
+      NewLIR4(cu, kThumb2Umull, res_lo, res_hi, rl_src1.low_reg, rl_src1.low_reg);
+      OpRegRegRegShift(cu, kOpAdd, res_hi, res_hi, tmp1, EncodeShift(kArmLsl, 1));
+    } else {
+      // In the special case, all temps are now allocated
+      NewLIR3(cu, kThumb2MulRRR, tmp1, rl_src2.low_reg, rl_src1.high_reg);
+      if (special_case) {
+        DCHECK_NE(rl_src1.low_reg, rl_src2.low_reg);
+        DCHECK_NE(rl_src1.high_reg, rl_src2.high_reg);
+        FreeTemp(cu, rl_src1.high_reg);
+      }
+      res_hi = AllocTemp(cu);
+
+      NewLIR4(cu, kThumb2Umull, res_lo, res_hi, rl_src2.low_reg, rl_src1.low_reg);
+      NewLIR4(cu, kThumb2Mla, tmp1, rl_src1.low_reg, rl_src2.high_reg, tmp1);
+      NewLIR4(cu, kThumb2AddRRR, res_hi, tmp1, res_hi, 0);
+      if (special_case) {
+        FreeTemp(cu, rl_src1.low_reg);
+        Clobber(cu, rl_src1.low_reg);
+        Clobber(cu, rl_src1.high_reg);
+      }
+    }
+    FreeTemp(cu, tmp1);
+    rl_result = GetReturnWide(cu, false); // Just using as a template.
+    rl_result.low_reg = res_lo;
+    rl_result.high_reg = res_hi;
+    StoreValueWide(cu, rl_dest, rl_result);
+    // Now, restore lr to its non-temp status.
+    Clobber(cu, rARM_LR);
+    UnmarkTemp(cu, rARM_LR);
+}
+
 bool ArmCodegen::GenAddLong(CompilationUnit* cu, RegLocation rl_dest, RegLocation rl_src1,
                             RegLocation rl_src2)
 {
@@ -568,8 +720,11 @@
   int len_offset = mirror::Array::LengthOffset().Int32Value();
   int data_offset;
   RegLocation rl_result;
+  bool constant_index = rl_index.is_const;
   rl_array = LoadValue(cu, rl_array, kCoreReg);
-  rl_index = LoadValue(cu, rl_index, kCoreReg);
+  if (!constant_index) {
+    rl_index = LoadValue(cu, rl_index, kCoreReg);
+  }
 
   if (rl_dest.wide) {
     data_offset = mirror::Array::DataOffset(sizeof(int64_t)).Int32Value();
@@ -577,6 +732,11 @@
     data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Int32Value();
   }
 
+  // If index is constant, just fold it into the data offset
+  if (constant_index) {
+    data_offset += ConstantValue(cu, rl_index) << scale;
+  }
+
   /* null object? */
   GenNullCheck(cu, rl_array.s_reg_low, rl_array.low_reg, opt_flags);
 
@@ -587,27 +747,38 @@
     /* Get len */
     LoadWordDisp(cu, rl_array.low_reg, len_offset, reg_len);
   }
-  if (rl_dest.wide || rl_dest.fp) {
-    // No special indexed operation, lea + load w/ displacement
-    int reg_ptr = AllocTemp(cu);
-    OpRegRegRegShift(cu, kOpAdd, reg_ptr, rl_array.low_reg, rl_index.low_reg,
-                     EncodeShift(kArmLsl, scale));
-    FreeTemp(cu, rl_index.low_reg);
+  if (rl_dest.wide || rl_dest.fp || constant_index) {
+    int reg_ptr;
+    if (constant_index) {
+      reg_ptr = rl_array.low_reg;  // NOTE: must not alter reg_ptr in constant case.
+    } else {
+      // No special indexed operation, lea + load w/ displacement
+      reg_ptr = AllocTemp(cu);
+      OpRegRegRegShift(cu, kOpAdd, reg_ptr, rl_array.low_reg, rl_index.low_reg,
+                       EncodeShift(kArmLsl, scale));
+      FreeTemp(cu, rl_index.low_reg);
+    }
     rl_result = EvalLoc(cu, rl_dest, reg_class, true);
 
     if (needs_range_check) {
-      // TODO: change kCondCS to a more meaningful name, is the sense of
-      // carry-set/clear flipped?
-      GenRegRegCheck(cu, kCondCs, rl_index.low_reg, reg_len, kThrowArrayBounds);
+      if (constant_index) {
+        GenImmedCheck(cu, kCondLs, reg_len, ConstantValue(cu, rl_index), kThrowConstantArrayBounds);
+      } else {
+        GenRegRegCheck(cu, kCondLs, reg_len, rl_index.low_reg, kThrowArrayBounds);
+      }
       FreeTemp(cu, reg_len);
     }
     if (rl_dest.wide) {
       LoadBaseDispWide(cu, reg_ptr, data_offset, rl_result.low_reg, rl_result.high_reg, INVALID_SREG);
-      FreeTemp(cu, reg_ptr);
+      if (!constant_index) {
+        FreeTemp(cu, reg_ptr);
+      }
       StoreValueWide(cu, rl_dest, rl_result);
     } else {
       LoadBaseDisp(cu, reg_ptr, data_offset, rl_result.low_reg, size, INVALID_SREG);
-      FreeTemp(cu, reg_ptr);
+      if (!constant_index) {
+        FreeTemp(cu, reg_ptr);
+      }
       StoreValue(cu, rl_dest, rl_result);
     }
   } else {
@@ -639,17 +810,28 @@
   RegisterClass reg_class = oat_reg_class_by_size(size);
   int len_offset = mirror::Array::LengthOffset().Int32Value();
   int data_offset;
+  bool constant_index = rl_index.is_const;
 
-  if (size == kLong || size == kDouble) {
+  if (rl_src.wide) {
     data_offset = mirror::Array::DataOffset(sizeof(int64_t)).Int32Value();
   } else {
     data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Int32Value();
   }
 
+  // If index is constant, just fold it into the data offset.
+  if (constant_index) {
+    data_offset += ConstantValue(cu, rl_index) << scale;
+  }
+
   rl_array = LoadValue(cu, rl_array, kCoreReg);
-  rl_index = LoadValue(cu, rl_index, kCoreReg);
-  int reg_ptr = INVALID_REG;
-  if (IsTemp(cu, rl_array.low_reg)) {
+  if (!constant_index) {
+    rl_index = LoadValue(cu, rl_index, kCoreReg);
+  }
+
+  int reg_ptr;
+  if (constant_index) {
+    reg_ptr = rl_array.low_reg;
+  } else if (IsTemp(cu, rl_array.low_reg)) {
     Clobber(cu, rl_array.low_reg);
     reg_ptr = rl_array.low_reg;
   } else {
@@ -668,18 +850,25 @@
     LoadWordDisp(cu, rl_array.low_reg, len_offset, reg_len);
   }
   /* at this point, reg_ptr points to array, 2 live temps */
-  if (rl_src.wide || rl_src.fp) {
+  if (rl_src.wide || rl_src.fp || constant_index) {
     if (rl_src.wide) {
       rl_src = LoadValueWide(cu, rl_src, reg_class);
     } else {
       rl_src = LoadValue(cu, rl_src, reg_class);
     }
-    OpRegRegRegShift(cu, kOpAdd, reg_ptr, rl_array.low_reg, rl_index.low_reg,
-                     EncodeShift(kArmLsl, scale));
+    if (!constant_index) {
+      OpRegRegRegShift(cu, kOpAdd, reg_ptr, rl_array.low_reg, rl_index.low_reg,
+                       EncodeShift(kArmLsl, scale));
+    }
     if (needs_range_check) {
-      GenRegRegCheck(cu, kCondCs, rl_index.low_reg, reg_len, kThrowArrayBounds);
+      if (constant_index) {
+        GenImmedCheck(cu, kCondLs, reg_len, ConstantValue(cu, rl_index), kThrowConstantArrayBounds);
+      } else {
+        GenRegRegCheck(cu, kCondLs, reg_len, rl_index.low_reg, kThrowArrayBounds);
+      }
       FreeTemp(cu, reg_len);
     }
+
     if (rl_src.wide) {
       StoreBaseDispWide(cu, reg_ptr, data_offset, rl_src.low_reg, rl_src.high_reg);
     } else {
@@ -696,7 +885,9 @@
     StoreBaseIndexed(cu, reg_ptr, rl_index.low_reg, rl_src.low_reg,
                      scale, size);
   }
-  FreeTemp(cu, reg_ptr);
+  if (!constant_index) {
+    FreeTemp(cu, reg_ptr);
+  }
 }
 
 /*
@@ -758,4 +949,163 @@
   MarkGCCard(cu, r_value, r_array);
 }
 
+bool ArmCodegen::GenShiftImmOpLong(CompilationUnit* cu, Instruction::Code opcode,
+                                   RegLocation rl_dest, RegLocation rl_src, RegLocation rl_shift)
+{
+  rl_src = LoadValueWide(cu, rl_src, kCoreReg);
+  // Per spec, we only care about low 6 bits of shift amount.
+  int shift_amount = ConstantValue(cu, rl_shift) & 0x3f;
+  if (shift_amount == 0) {
+    StoreValueWide(cu, rl_dest, rl_src);
+    return false; // TODO: remove useless bool return result.
+  }
+  if (BadOverlap(cu, rl_src, rl_dest)) {
+    return GenShiftOpLong(cu, opcode, rl_dest, rl_src, rl_shift);
+  }
+  RegLocation rl_result = EvalLoc(cu, rl_dest, kCoreReg, true);
+  switch(opcode) {
+    case Instruction::SHL_LONG:
+    case Instruction::SHL_LONG_2ADDR:
+      if (shift_amount == 1) {
+        OpRegRegReg(cu, kOpAdd, rl_result.low_reg, rl_src.low_reg, rl_src.low_reg);
+        OpRegRegReg(cu, kOpAdc, rl_result.high_reg, rl_src.high_reg, rl_src.high_reg);
+      } else if (shift_amount == 32) {
+        OpRegCopy(cu, rl_result.high_reg, rl_src.low_reg);
+        LoadConstant(cu, rl_result.low_reg, 0);
+      } else if (shift_amount > 31) {
+        OpRegRegImm(cu, kOpLsl, rl_result.high_reg, rl_src.low_reg, shift_amount - 32);
+        LoadConstant(cu, rl_result.low_reg, 0);
+      } else {
+        OpRegRegImm(cu, kOpLsl, rl_result.high_reg, rl_src.high_reg, shift_amount);
+        OpRegRegRegShift(cu, kOpOr, rl_result.high_reg, rl_result.high_reg, rl_src.low_reg,
+                         EncodeShift(kArmLsr, 32 - shift_amount));
+        OpRegRegImm(cu, kOpLsl, rl_result.low_reg, rl_src.low_reg, shift_amount);
+      }
+      break;
+    case Instruction::SHR_LONG:
+    case Instruction::SHR_LONG_2ADDR:
+      if (shift_amount == 32) {
+        OpRegCopy(cu, rl_result.low_reg, rl_src.high_reg);
+        OpRegRegImm(cu, kOpAsr, rl_result.high_reg, rl_src.high_reg, 31);
+      } else if (shift_amount > 31) {
+        OpRegRegImm(cu, kOpAsr, rl_result.low_reg, rl_src.high_reg, shift_amount - 32);
+        OpRegRegImm(cu, kOpAsr, rl_result.high_reg, rl_src.high_reg, 31);
+      } else {
+        int t_reg = AllocTemp(cu);
+        OpRegRegImm(cu, kOpLsr, t_reg, rl_src.low_reg, shift_amount);
+        OpRegRegRegShift(cu, kOpOr, rl_result.low_reg, t_reg, rl_src.high_reg,
+                         EncodeShift(kArmLsl, 32 - shift_amount));
+        FreeTemp(cu, t_reg);
+        OpRegRegImm(cu, kOpAsr, rl_result.high_reg, rl_src.high_reg, shift_amount);
+      }
+      break;
+    case Instruction::USHR_LONG:
+    case Instruction::USHR_LONG_2ADDR:
+      if (shift_amount == 32) {
+        OpRegCopy(cu, rl_result.low_reg, rl_src.high_reg);
+        LoadConstant(cu, rl_result.high_reg, 0);
+      } else if (shift_amount > 31) {
+        OpRegRegImm(cu, kOpLsr, rl_result.low_reg, rl_src.high_reg, shift_amount - 32);
+        LoadConstant(cu, rl_result.high_reg, 0);
+      } else {
+        int t_reg = AllocTemp(cu);
+        OpRegRegImm(cu, kOpLsr, t_reg, rl_src.low_reg, shift_amount);
+        OpRegRegRegShift(cu, kOpOr, rl_result.low_reg, t_reg, rl_src.high_reg,
+                         EncodeShift(kArmLsl, 32 - shift_amount));
+        FreeTemp(cu, t_reg);
+        OpRegRegImm(cu, kOpLsr, rl_result.high_reg, rl_src.high_reg, shift_amount);
+      }
+      break;
+    default:
+      LOG(FATAL) << "Unexpected case";
+      return true;
+  }
+  StoreValueWide(cu, rl_dest, rl_result);
+  return false;
+}
+
+bool ArmCodegen::GenArithImmOpLong(CompilationUnit* cu, Instruction::Code opcode,
+                                   RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2)
+{
+  if ((opcode == Instruction::SUB_LONG_2ADDR) || (opcode == Instruction::SUB_LONG)) {
+    if (!rl_src2.is_const) {
+      // Don't bother with special handling for subtract from immediate.
+      return GenArithOpLong(cu, opcode, rl_dest, rl_src1, rl_src2);
+    }
+  } else {
+    // Normalize
+    if (!rl_src2.is_const) {
+      DCHECK(rl_src1.is_const);
+      RegLocation rl_temp = rl_src1;
+      rl_src1 = rl_src2;
+      rl_src2 = rl_temp;
+    }
+  }
+  if (BadOverlap(cu, rl_src1, rl_dest)) {
+    return GenArithOpLong(cu, opcode, rl_dest, rl_src1, rl_src2);
+  }
+  DCHECK(rl_src2.is_const);
+  int64_t val = ConstantValueWide(cu, rl_src2);
+  uint32_t val_lo = Low32Bits(val);
+  uint32_t val_hi = High32Bits(val);
+  int32_t mod_imm_lo = ModifiedImmediate(val_lo);
+  int32_t mod_imm_hi = ModifiedImmediate(val_hi);
+
+  // Only a subset of add/sub immediate instructions set carry - so bail if we don't fit
+  switch(opcode) {
+    case Instruction::ADD_LONG:
+    case Instruction::ADD_LONG_2ADDR:
+    case Instruction::SUB_LONG:
+    case Instruction::SUB_LONG_2ADDR:
+      if ((mod_imm_lo < 0) || (mod_imm_hi < 0)) {
+        return GenArithOpLong(cu, opcode, rl_dest, rl_src1, rl_src2);
+      }
+      break;
+    default:
+      break;
+  }
+  rl_src1 = LoadValueWide(cu, rl_src1, kCoreReg);
+  RegLocation rl_result = EvalLoc(cu, rl_dest, kCoreReg, true);
+  // NOTE: once we've done the EvalLoc on dest, we can no longer bail.
+  switch (opcode) {
+    case Instruction::ADD_LONG:
+    case Instruction::ADD_LONG_2ADDR:
+      NewLIR3(cu, kThumb2AddRRI8, rl_result.low_reg, rl_src1.low_reg, mod_imm_lo);
+      NewLIR3(cu, kThumb2AdcRRI8, rl_result.high_reg, rl_src1.high_reg, mod_imm_hi);
+      break;
+    case Instruction::OR_LONG:
+    case Instruction::OR_LONG_2ADDR:
+      if ((val_lo != 0) || (rl_result.low_reg != rl_src1.low_reg)) {
+        OpRegRegImm(cu, kOpOr, rl_result.low_reg, rl_src1.low_reg, val_lo);
+      }
+      if ((val_hi != 0) || (rl_result.high_reg != rl_src1.high_reg)) {
+        OpRegRegImm(cu, kOpOr, rl_result.high_reg, rl_src1.high_reg, val_hi);
+      }
+      break;
+    case Instruction::XOR_LONG:
+    case Instruction::XOR_LONG_2ADDR:
+      OpRegRegImm(cu, kOpXor, rl_result.low_reg, rl_src1.low_reg, val_lo);
+      OpRegRegImm(cu, kOpXor, rl_result.high_reg, rl_src1.high_reg, val_hi);
+      break;
+    case Instruction::AND_LONG:
+    case Instruction::AND_LONG_2ADDR:
+      if ((val_lo != 0xffffffff) || (rl_result.low_reg != rl_src1.low_reg)) {
+        OpRegRegImm(cu, kOpAnd, rl_result.low_reg, rl_src1.low_reg, val_lo);
+      }
+      if ((val_hi != 0xffffffff) || (rl_result.high_reg != rl_src1.high_reg)) {
+        OpRegRegImm(cu, kOpAnd, rl_result.high_reg, rl_src1.high_reg, val_hi);
+      }
+      break;
+    case Instruction::SUB_LONG_2ADDR:
+    case Instruction::SUB_LONG:
+      NewLIR3(cu, kThumb2SubRRI8, rl_result.low_reg, rl_src1.low_reg, mod_imm_lo);
+      NewLIR3(cu, kThumb2SbcRRI8, rl_result.high_reg, rl_src1.high_reg, mod_imm_hi);
+      break;
+    default:
+      LOG(FATAL) << "Unexpected opcode " << opcode;
+  }
+  StoreValueWide(cu, rl_dest, rl_result);
+  return false;  // TODO: remove bool return value from all of these Gen routines.
+}
+
 }  // namespace art