Launch options & multi inputs for single-source RS

Bug: 23535985

Also renamed rsParallelFor to rsForEach, and deprecated previous
rsForEach.

Change-Id: Ibc22bd5e9585a4471b15920ef60fe1fe2312de49
diff --git a/api/rs_for_each.spec b/api/rs_for_each.spec
index fee1a71..c9bb2e6 100644
--- a/api/rs_for_each.spec
+++ b/api/rs_for_each.spec
@@ -83,6 +83,15 @@
  over cells 4, 5, 6, and 7 in the X dimension, set xStart to 4 and xEnd to 8.
 end:
 
+type: rs_kernel
+version: UNRELEASED
+simple: void*
+summary: Handle to a kernel function
+description:
+  An opaque type for a function that is defined with the kernel attribute.  A value
+  of this type can be used in a @rsForEach call to launch a kernel.
+end:
+
 function: rsForEach
 version: 9 13
 ret: void
@@ -91,26 +100,34 @@
 arg: rs_allocation output, "Allocation to write date into."
 arg: const void* usrData, "User defined data to pass to the script.  May be NULL."
 arg: const rs_script_call_t* sc, "Extra control information used to select a sub-region of the allocation to be processed or suggest a walking strategy.  May be NULL."
-summary: Invoke the root kernel of a script
+summary: Launches a kernel
 description:
- Invoke the kernel named "root" of the specified script.  Like other kernels, this root()
- function will be invoked repeatedly over the cells of the specificed allocation, filling
- the output allocation with the results.
+ Runs the kernel over zero or more input allocations. They are passed after the
+ @rs_kernel argument. If the specified kernel returns a value, an output allocation
+ must be specified as the last argument. All input allocations,
+ and the output allocation if it exists, must have the same dimensions.
 
- When rsForEach is called, the root script is launched immediately.  rsForEach returns
- only when the script has completed and the output allocation is ready to use.
+ This is a synchronous function. A call to this function only returns after all
+ the work has completed for all cells of the input allocations. If the kernel
+ function returns any value, the call waits until all results have been written
+ to the output allocation.
 
- The rs_script argument is typically initialized using a global variable set from Java.
+ Up to API level 23, the kernel is implicitly specified as the kernel named
+ "root" in the specified script, and only a single input allocation can be used.
+ Starting in API level *UNRELEASED*, an arbitrary kernel function can be used,
+ as specified by the kernel argument. The script argument is removed.
+ The kernel must be defined in the current script. In addition, more than one
+ input can be used.
 
- The kernel can be invoked with just an input allocation or just an output allocation.
- This can be done by defining an rs_allocation variable and not initializing it.  E.g.<code><br/>
- rs_script gCustomScript;<br/>
- void specializedProcessing(rs_allocation in) {<br/>
- &nbsp;&nbsp;rs_allocation ignoredOut;<br/>
- &nbsp;&nbsp;rsForEach(gCustomScript, in, ignoredOut);<br/>
- }<br/></code>
-
- If both input and output allocations are specified, they must have the same dimensions.
+ E.g.<code><br/>
+ float __attribute__((kernel)) square(float a) {<br/>
+ &nbsp;&nbsp;return a * a;<br/>
+ }<br/>
+ <br/>
+ void compute(rs_allocation ain, rs_allocation aout) {<br/>
+ &nbsp;&nbsp;rsForEach(square, ain, aout);<br/>
+ }<br/>
+ <br/></code>
 test: none
 end:
 
@@ -148,7 +165,7 @@
 end:
 
 function: rsForEach
-version: 14
+version: 14 23
 ret: void
 arg: rs_script script
 arg: rs_allocation input
@@ -156,33 +173,43 @@
 test: none
 end:
 
-type: rs_kernel
-version: UNRELEASED
-simple: void*
-summary: Handle to a kernel function
-description:
-  An opaque type for a function that is defined with the kernel attribute.  A value
-  of this type can be used in a @rsParallelFor call to launch a kernel.
-end:
-
-function: rsParallelFor
+function: rsForEach
 version: UNRELEASED
 intrinsic: true
-attrib: =
+attrib: =  # Not overloadable
 ret: void
 arg: rs_kernel kernel, "Function designator to a function that is defined with the kernel attribute."
 arg: ..., "Input and output allocations"
-summary: Run a kernel defined in the current Script
-description:
-  Runs the kernel over zero or more input allocations. They are passed after the
-  @rs_kernel argument. If the specified kernel returns a value, an output allocation
-  must be specified as the last argument. All input allocations,
-  and the output allocation if it exists, must have the same dimensions.
+test: none
+end:
 
-  This is a synchronous function. A call to this function only returns after all
-  the work has completed for all cells of the input allocations. If the kernel
-  function returns any value, the call waits until all results have been written
-  to the output allocation.
+function: rsForEachWithOptions
+version: UNRELEASED
+intrinsic: true
+attrib: =  # Not overloadable
+ret: void
+arg: rs_kernel kernel, "Function designator to a function that is defined with the kernel attribute."
+arg: rs_script_call_t* options, "Launch options"
+arg: ..., "Input and output allocations"
+summary: Launches a kernel with options
+description:
+ Launches a kernel in a way similar to @rsForEach. However, instead of processing
+ all cells in the input, this function only processes cells in the subspace of
+ the index space specified in options. With the index space explicitly specified
+ by options, no input or output allocation is required for a kernel launch using
+ this API. If allocations are passed in, they must match the number of arguments
+ and return value expected by the kernel function. The output allocation is
+ present if and only if the kernel has a non-void return value.
+
+ E.g., <code><br/>
+    rs_script_call_t opts = {0};<br/>
+    opts.xStart = 0;<br/>
+    opts.xEnd = dimX;<br/>
+    opts.yStart = 0;<br/>
+    opts.yEnd = dimY / 2;<br/>
+    rsForEachWithOptions(foo, &opts, out, out);<br/>
+</code>
+
 test: none
 end:
 
@@ -191,6 +218,7 @@
 internal: true
 ret: void
 arg: int slot
+arg: rs_script_call_t* options
 arg: rs_allocation input
 arg: rs_allocation output
 summary: (Internal API) Launch a kernel in the current Script (with the slot number)
diff --git a/driver/rsdRuntimeStubs.cpp b/driver/rsdRuntimeStubs.cpp
index 86c4107..07d6b4a 100644
--- a/driver/rsdRuntimeStubs.cpp
+++ b/driver/rsdRuntimeStubs.cpp
@@ -433,11 +433,31 @@
 //////////////////////////////////////////////////////////////////////////////
 // ForEach routines
 //////////////////////////////////////////////////////////////////////////////
-void rsForEachInternal(int slot, ::rs_allocation in, ::rs_allocation out) {
+// Launches kernel `slot` of the current script. The variadic tail holds numIn input
+// ::rs_allocation values followed by one output ::rs_allocation when hasOutput is
+// non-zero; `call` is forwarded to rsrForEach. The input array uses the array form
+// of unique_ptr so that storage obtained with new[] is released with delete[].
+void rsForEachInternal(int slot, rs_script_call *call, int hasOutput, int numIn, ...) {
     Context *rsc = RsdCpuReference::getTlsContext();
     Script *s = const_cast<Script*>(RsdCpuReference::getTlsScript());
-    rsrForEach(rsc, s, slot, (Allocation *)in.p, (Allocation *)out.p,
-               nullptr, 0, nullptr);
+    std::unique_ptr<Allocation*[]> inputs(numIn > 0 ? new Allocation*[numIn] : nullptr);
+    if (numIn > 0 && inputs == nullptr) {
+        ALOGE("rsForEachInternal: out of memory for %d inputs.", numIn);
+        return;
+    }
+    Allocation* out = nullptr;
+    va_list argp;
+    va_start(argp, numIn);
+    for (int i = 0; i < numIn; i++) {
+        ::rs_allocation alloc = va_arg(argp, ::rs_allocation);
+        inputs[i] = reinterpret_cast<Allocation*>(const_cast<int*>(alloc.p));
+    }
+    if (hasOutput) {
+        ::rs_allocation outAlloc = va_arg(argp, ::rs_allocation);
+        out = reinterpret_cast<Allocation*>(const_cast<int*>(outAlloc.p));
+    }
+    va_end(argp);
+    rsrForEach(rsc, s, slot, numIn, inputs.get(), out, nullptr, 0, (RsScriptCall*)call);
 }
 
 void __attribute__((overloadable)) rsForEach(::rs_script script,
@@ -446,7 +466,7 @@
                                              const void *usr,
                                              const rs_script_call *call) {
     Context *rsc = RsdCpuReference::getTlsContext();
-    rsrForEach(rsc, (Script *)script.p, 0, (Allocation *)in.p,
+    rsrForEach(rsc, (Script *)script.p, 0, 1, (Allocation **)&in.p,
                (Allocation *)out.p, usr, 0, (RsScriptCall *)call);
 }
 
@@ -455,7 +475,7 @@
                                              ::rs_allocation out,
                                              const void *usr) {
     Context *rsc = RsdCpuReference::getTlsContext();
-    rsrForEach(rsc, (Script *)script.p, 0, (Allocation *)in.p, (Allocation *)out.p,
+    rsrForEach(rsc, (Script *)script.p, 0, 1, (Allocation **)&in.p, (Allocation *)out.p,
                usr, 0, nullptr);
 }
 
@@ -463,7 +483,7 @@
                                              ::rs_allocation in,
                                              ::rs_allocation out) {
     Context *rsc = RsdCpuReference::getTlsContext();
-    rsrForEach(rsc, (Script *)script.p, 0, (Allocation *)in.p, (Allocation *)out.p,
+    rsrForEach(rsc, (Script *)script.p, 0, 1, (Allocation **)&in.p, (Allocation *)out.p,
                nullptr, 0, nullptr);
 }
 
@@ -475,7 +495,7 @@
                                              const void *usr,
                                              uint32_t usrLen) {
     Context *rsc = RsdCpuReference::getTlsContext();
-    rsrForEach(rsc, (Script *)script.p, 0, (Allocation *)in.p, (Allocation *)out.p,
+    rsrForEach(rsc, (Script *)script.p, 0, 1, (Allocation **)&in.p, (Allocation *)out.p,
                usr, usrLen, nullptr);
 }
 
@@ -486,7 +506,7 @@
                                              uint32_t usrLen,
                                              const rs_script_call *call) {
     Context *rsc = RsdCpuReference::getTlsContext();
-    rsrForEach(rsc, (Script *)script.p, 0, (Allocation *)in.p, (Allocation *)out.p,
+    rsrForEach(rsc, (Script *)script.p, 0, 1, (Allocation **)&in.p, (Allocation *)out.p,
                usr, usrLen, (RsScriptCall *)call);
 }
 #endif
diff --git a/java/tests/RsTest/src/com/android/rs/test/single_source_script.rs b/java/tests/RsTest/src/com/android/rs/test/single_source_script.rs
index c71b359..e34dd5b 100644
--- a/java/tests/RsTest/src/com/android/rs/test/single_source_script.rs
+++ b/java/tests/RsTest/src/com/android/rs/test/single_source_script.rs
@@ -7,6 +7,10 @@
     return a * 2;
 }
 
+int __attribute__((kernel)) goo(int a, int b) {
+    return a + b;
+}
+
 static void validate(rs_allocation out) {
     bool failed = false;
 
@@ -15,7 +19,11 @@
     for (j = 0; j < dimY; j++) {
         for (i = 0; i < dimX; i++) {
             const int actual = rsGetElementAt_int(out, i, j);
-            const int expected = (i + j * dimX) * 4;
+            int expected = (i + j * dimX) * 4;
+            if (j < dimY / 2) {
+                expected *= 2;
+            }
+            expected += (i + j * dimX);
             if (actual != expected) {
                 failed = true;
                 rsDebug("row     ", j);
@@ -47,8 +55,16 @@
         }
     }
 
-    rsParallelFor(foo, in, out);
-    rsParallelFor(foo, out, out);
+    rsForEach(foo, in, out);
+    rsForEach(foo, out, out);
+    rs_script_call_t opts = {0};
+    opts.xStart = 0;
+    opts.xEnd = dimX;
+    opts.yStart = 0;
+    opts.yEnd = dimY / 2;
+    rsForEachWithOptions(foo, &opts, out, out);
+
+    rsForEach(goo, in, out, out);
 
     validate(out);
 }
diff --git a/rsRuntime.h b/rsRuntime.h
index bcaebcd..9bc05b3 100644
--- a/rsRuntime.h
+++ b/rsRuntime.h
@@ -156,7 +156,8 @@
 
 void rsrForEach(Context *, Script *target,
                 uint32_t slot,
-                Allocation *in,
+                uint32_t numInputs,
+                Allocation **in,
                 Allocation *out,
                 const void *usr,
                 uint32_t usrBytes,
diff --git a/rsScriptC_Lib.cpp b/rsScriptC_Lib.cpp
index 9a9c57f..a411e34 100644
--- a/rsScriptC_Lib.cpp
+++ b/rsScriptC_Lib.cpp
@@ -237,20 +237,11 @@
 void rsrForEach(Context *rsc,
                 Script *target,
                 uint32_t slot,
-                Allocation *in, Allocation *out,
+                uint32_t numInputs,
+                Allocation **in, Allocation *out,
                 const void *usr, uint32_t usrBytes,
                 const RsScriptCall *call) {
-
-    if (in == nullptr) {
-        target->runForEach(rsc, slot, nullptr, 0, out, usr,
-                           usrBytes, call);
-
-    } else {
-        const Allocation *ins[1] = {in};
-        target->runForEach(rsc, slot, ins,
-                           sizeof(ins) / sizeof(RsAllocation), out, usr,
-                           usrBytes, call);
-    }
+    target->runForEach(rsc, slot, (const Allocation**)in, numInputs, out, usr, usrBytes, call);
 }
 
 void rsrAllocationSyncAll(Context *rsc, Allocation *a, RsAllocationUsageType usage) {
diff --git a/scriptc/rs_for_each.rsh b/scriptc/rs_for_each.rsh
index a105cb2..6a42b41 100644
--- a/scriptc/rs_for_each.rsh
+++ b/scriptc/rs_for_each.rsh
@@ -94,7 +94,7 @@
  * rs_kernel: Handle to a kernel function
  *
  *  An opaque type for a function that is defined with the kernel attribute.  A value
- *  of this type can be used in a rsParallelFor call to launch a kernel.
+ *  of this type can be used in a rsForEach call to launch a kernel.
  */
 #if (defined(RS_VERSION) && (RS_VERSION >= 4294967295) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 4294967295))
 typedef void* rs_kernel;
@@ -129,6 +129,8 @@
  *   usrData: User defined data to pass to the script.  May be NULL.
  *   sc: Extra control information used to select a sub-region of the allocation to be processed or suggest a walking strategy.  May be NULL.
  *   usrDataLen: Size of the userData structure.  This will be used to perform a shallow copy of the data if necessary.
+ *   kernel: Function designator to a function that is defined with the kernel attribute.
+ *   ...: Input and output allocations
  */
 #if !defined(RS_VERSION) || (RS_VERSION <= 13)
 extern void __attribute__((overloadable))
@@ -153,31 +155,29 @@
               size_t usrDataLen);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+#if (defined(RS_VERSION) && (RS_VERSION >= 14) && (RS_VERSION <= 23))
 extern void __attribute__((overloadable))
     rsForEach(rs_script script, rs_allocation input, rs_allocation output);
 #endif
 
+#if (defined(RS_VERSION) && (RS_VERSION >= 4294967295) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 4294967295))
+extern void
+    rsForEach(rs_kernel kernel,  ...);
+#endif
+
 /*
- * rsParallelFor: Run a kernel defined in the current Script
+ * rsForEachWithOptions: Launches a kernel with options
  *
+ *  Like rsForEach, but processes only the cells in the index subspace given by options.
- *  rs_kernel argument. If the specified kernel returns a value, an output allocation
- *  must be specified as the last argument. All input allocations,
- *  and the output allocation if it exists, must have the same dimensions.
- *
- *  This is a synchronous function. A call to this function only returns after all
- *  the work has completed for all cells of the input allocations. If the kernel
- *  function returns any value, the call would also have to wait until all results
- *  have been written to the output allocation.
+ *  TBD
  *
  * Parameters:
  *   kernel: Function designator to a function that is defined with the kernel attribute.
+ *   options: Launch options
  *   ...: Input and output allocations
  */
 #if (defined(RS_VERSION) && (RS_VERSION >= 4294967295) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 4294967295))
 extern void
-    rsParallelFor(rs_kernel kernel,  ...);
+    rsForEachWithOptions(rs_kernel kernel, rs_script_call_t* options,  ...);
 #endif
 
 /*