libfec: fix thread count calculation

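On systems with a large number of cores, smallish reads may not be
able to utilize all the cores. Capping the thread count at the number
of blocks is not sufficient: the per-thread size is rounded up to a
whole number of blocks, so fewer threads may be needed to cover the
read (e.g. 5 blocks split across 4 threads gives 2 blocks per thread,
which only needs 3 threads). Correctly compute the number of threads
we should use from the per-thread size.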

Change-Id: I7a05c144c2b83a2f6083c33a686ced32cce576c4
diff --git a/libfec/fec_process.cpp b/libfec/fec_process.cpp
index 3b2846c..6e0ddd1 100644
--- a/libfec/fec_process.cpp
+++ b/libfec/fec_process.cpp
@@ -62,11 +62,13 @@
     uint64_t start = (offset / FEC_BLOCKSIZE) * FEC_BLOCKSIZE;
     size_t blocks = fec_div_round_up(count, FEC_BLOCKSIZE);
 
-    if ((size_t)threads > blocks) {
-        threads = (int)blocks;
+    size_t count_per_thread = fec_div_round_up(blocks, threads) * FEC_BLOCKSIZE;
+    size_t max_threads = fec_div_round_up(count, count_per_thread);
+
+    if ((size_t)threads > max_threads) {
+        threads = (int)max_threads;
     }
 
-    size_t count_per_thread = fec_div_round_up(blocks, threads) * FEC_BLOCKSIZE;
     size_t left = count;
     uint64_t pos = offset;
     uint64_t end = start + count_per_thread;