blob: 889f990df2f7565422e5fa5190f4b2c687e8768d [file] [log] [blame]
Sami Tolvanenc54a33d2015-06-26 14:28:31 +01001/*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <fcntl.h>
18#include <stdlib.h>
19#include <sys/mman.h>
20
21extern "C" {
22 #include <fec.h>
23}
24
25#include "fec_private.h"
26
27using rs_unique_ptr = std::unique_ptr<void, decltype(&free_rs_char)>;
28
29/* prints a hexdump of `data' using warn(...) */
30static void dump(const char *name, uint64_t value, const uint8_t *data,
31 size_t size)
32{
33 const int bytes_per_line = 16;
34 char hex[bytes_per_line * 3 + 1];
35 char prn[bytes_per_line + 1];
36
37 warn("%s (%" PRIu64 ") (%zu bytes):", name ? name : "", value, size);
38
39 if (!data) {
40 warn(" (null)");
41 return;
42 }
43
44 for (size_t n = 0; n < size; n += bytes_per_line) {
45 memset(hex, 0, sizeof(hex));
46 memset(prn, 0, sizeof(prn));
47
48 for (size_t m = 0; m < bytes_per_line; ++m) {
49 if (n + m < size) {
George Burgess IVac4f7e72016-03-02 14:15:49 -080050 ptrdiff_t offset = &hex[m * 3] - hex;
51 snprintf(hex + offset, sizeof(hex) - offset, "%02x ",
52 data[n + m]);
Sami Tolvanenc54a33d2015-06-26 14:28:31 +010053
54 if (isprint(data[n + m])) {
55 prn[m] = data[n + m];
56 } else {
57 prn[m] = '.';
58 }
59 } else {
60 strcpy(&hex[m * 3], " ");
61 }
62 }
63
64 warn(" %04zu %s %s", n, hex, prn);
65 }
66}
67
68/* checks if `offset' is within a corrupted block */
69static inline bool is_erasure(fec_handle *f, uint64_t offset,
70 const uint8_t *data)
71{
72 if (unlikely(offset >= f->data_size)) {
73 return false;
74 }
75
76 /* ideally, we would like to know if a specific byte on this block has
77 been corrupted, but knowing whether any of them is can be useful as
78 well, because often the entire block is corrupted */
79
80 uint64_t n = offset / FEC_BLOCKSIZE;
81
Tianjie Xu622e6282019-12-02 12:55:33 -080082 return !f->hashtree().check_block_hash_with_index(n, data);
Sami Tolvanenc54a33d2015-06-26 14:28:31 +010083}
84
85/* check if `offset' is within a block expected to contain zeros */
86static inline bool is_zero(fec_handle *f, uint64_t offset)
87{
Tianjie Xuef9dc362019-11-20 12:18:40 -080088 auto hashtree = f->hashtree();
Sami Tolvanenc54a33d2015-06-26 14:28:31 +010089
Tianjie Xu622e6282019-12-02 12:55:33 -080090 if (hashtree.hash_data.empty() || unlikely(offset >= f->data_size)) {
Sami Tolvanenc54a33d2015-06-26 14:28:31 +010091 return false;
92 }
93
94 uint64_t hash_offset = (offset / FEC_BLOCKSIZE) * SHA256_DIGEST_LENGTH;
95
Tianjie Xu622e6282019-12-02 12:55:33 -080096 if (unlikely(hash_offset >
97 hashtree.hash_data.size() - SHA256_DIGEST_LENGTH)) {
Sami Tolvanenc54a33d2015-06-26 14:28:31 +010098 return false;
99 }
100
Tianjie Xu622e6282019-12-02 12:55:33 -0800101 return !memcmp(hashtree.zero_hash.data(), &hashtree.hash_data[hash_offset],
Tianjie Xuef9dc362019-11-20 12:18:40 -0800102 SHA256_DIGEST_LENGTH);
Sami Tolvanenc54a33d2015-06-26 14:28:31 +0100103}
104
105/* reads and decodes a single block starting from `offset', returns the number
106 of bytes corrected in `errors' */
107static int __ecc_read(fec_handle *f, void *rs, uint8_t *dest, uint64_t offset,
108 bool use_erasures, uint8_t *ecc_data, size_t *errors)
109{
110 check(offset % FEC_BLOCKSIZE == 0);
111 ecc_info *e = &f->ecc;
112
113 /* reverse interleaving: calculate the RS block that includes the requested
114 offset */
115 uint64_t rsb = offset - (offset / (e->rounds * FEC_BLOCKSIZE)) *
116 e->rounds * FEC_BLOCKSIZE;
117 int data_index = -1;
118 int erasures[e->rsn];
119 int neras = 0;
120
121 /* verity is required to check for erasures */
Tianjie Xu622e6282019-12-02 12:55:33 -0800122 check(!use_erasures || !f->hashtree().hash_data.empty());
Sami Tolvanenc54a33d2015-06-26 14:28:31 +0100123
124 for (int i = 0; i < e->rsn; ++i) {
125 uint64_t interleaved = fec_ecc_interleave(rsb * e->rsn + i, e->rsn,
126 e->rounds);
127
128 if (interleaved == offset) {
129 data_index = i;
130 }
131
Sami Tolvanene9677cc2015-12-17 13:36:03 +0000132 /* to improve our chances of correcting IO errors, initialize the
133 buffer to zeros even if we are going to read to it later */
134 uint8_t bbuf[FEC_BLOCKSIZE] = {0};
Sami Tolvanenc54a33d2015-06-26 14:28:31 +0100135
Sami Tolvanene9677cc2015-12-17 13:36:03 +0000136 if (likely(interleaved < e->start) && !is_zero(f, interleaved)) {
137 /* copy raw data to reconstruct the RS block */
Tianjie Xuef9dc362019-11-20 12:18:40 -0800138 if (!raw_pread(f->fd, bbuf, FEC_BLOCKSIZE, interleaved)) {
Sami Tolvanene9677cc2015-12-17 13:36:03 +0000139 warn("failed to read: %s", strerror(errno));
Sami Tolvanenc54a33d2015-06-26 14:28:31 +0100140
Sami Tolvanene9677cc2015-12-17 13:36:03 +0000141 /* treat errors as corruption */
142 if (use_erasures && neras <= e->roots) {
143 erasures[neras++] = i;
144 }
145 } else if (use_erasures && neras <= e->roots &&
Tianjie Xuef9dc362019-11-20 12:18:40 -0800146 is_erasure(f, interleaved, bbuf)) {
Sami Tolvanenc54a33d2015-06-26 14:28:31 +0100147 erasures[neras++] = i;
148 }
149 }
150
151 for (int j = 0; j < FEC_BLOCKSIZE; ++j) {
152 ecc_data[j * FEC_RSM + i] = bbuf[j];
153 }
154 }
155
156 check(data_index >= 0);
157
158 size_t nerrs = 0;
159 uint8_t copy[FEC_RSM];
160
161 for (int i = 0; i < FEC_BLOCKSIZE; ++i) {
162 /* copy parity data */
Tianjie Xuef9dc362019-11-20 12:18:40 -0800163 if (!raw_pread(f->fd, &ecc_data[i * FEC_RSM + e->rsn], e->roots,
164 e->start + (i + rsb) * e->roots)) {
Sami Tolvanenc54a33d2015-06-26 14:28:31 +0100165 error("failed to read ecc data: %s", strerror(errno));
166 return -1;
167 }
168
169 /* for debugging decoding failures, because decode_rs_char can mangle
170 ecc_data */
171 if (unlikely(use_erasures)) {
172 memcpy(copy, &ecc_data[i * FEC_RSM], FEC_RSM);
173 }
174
175 /* decode */
176 int rc = decode_rs_char(rs, &ecc_data[i * FEC_RSM], erasures, neras);
177
178 if (unlikely(rc < 0)) {
179 if (use_erasures) {
180 error("RS block %" PRIu64 ": decoding failed (%d erasures)",
181 rsb, neras);
182 dump("raw RS block", rsb, copy, FEC_RSM);
Tianjie Xu622e6282019-12-02 12:55:33 -0800183 } else if (f->hashtree().hash_data.empty()) {
Sami Tolvanenc54a33d2015-06-26 14:28:31 +0100184 warn("RS block %" PRIu64 ": decoding failed", rsb);
185 } else {
186 debug("RS block %" PRIu64 ": decoding failed", rsb);
187 }
188
189 errno = EIO;
190 return -1;
191 } else if (unlikely(rc > 0)) {
192 check(rc <= (use_erasures ? e->roots : e->roots / 2));
193 nerrs += rc;
194 }
195
196 dest[i] = ecc_data[i * FEC_RSM + data_index];
197 }
198
199 if (nerrs) {
200 warn("RS block %" PRIu64 ": corrected %zu errors", rsb, nerrs);
201 *errors += nerrs;
202 }
203
204 return FEC_BLOCKSIZE;
205}
206
207/* initializes RS decoder and allocates memory for interleaving */
208static int ecc_init(fec_handle *f, rs_unique_ptr& rs,
209 std::unique_ptr<uint8_t[]>& ecc_data)
210{
211 check(f);
212
213 rs.reset(init_rs_char(FEC_PARAMS(f->ecc.roots)));
214
215 if (unlikely(!rs)) {
216 error("failed to initialize RS");
217 errno = ENOMEM;
218 return -1;
219 }
220
221 ecc_data.reset(new (std::nothrow) uint8_t[FEC_RSM * FEC_BLOCKSIZE]);
222
223 if (unlikely(!ecc_data)) {
224 error("failed to allocate ecc buffer");
225 errno = ENOMEM;
226 return -1;
227 }
228
229 return 0;
230}
231
232/* reads `count' bytes from `offset' and corrects possible errors without
233 erasure detection, returning the number of corrected bytes in `errors' */
234static ssize_t ecc_read(fec_handle *f, uint8_t *dest, size_t count,
235 uint64_t offset, size_t *errors)
236{
237 check(f);
238 check(dest);
239 check(offset < f->data_size);
240 check(offset + count <= f->data_size);
241 check(errors);
242
243 debug("[%" PRIu64 ", %" PRIu64 ")", offset, offset + count);
244
245 rs_unique_ptr rs(NULL, free_rs_char);
246 std::unique_ptr<uint8_t[]> ecc_data;
247
248 if (ecc_init(f, rs, ecc_data) == -1) {
249 return -1;
250 }
251
252 uint64_t curr = offset / FEC_BLOCKSIZE;
253 size_t coff = (size_t)(offset - curr * FEC_BLOCKSIZE);
254 size_t left = count;
255
256 uint8_t data[FEC_BLOCKSIZE];
257
258 while (left > 0) {
259 /* there's no erasure detection without verity metadata */
260 if (__ecc_read(f, rs.get(), data, curr * FEC_BLOCKSIZE, false,
261 ecc_data.get(), errors) == -1) {
262 return -1;
263 }
264
265 size_t copy = FEC_BLOCKSIZE - coff;
266
267 if (copy > left) {
268 copy = left;
269 }
270
271 memcpy(dest, &data[coff], copy);
272
273 dest += copy;
274 left -= copy;
275 coff = 0;
276 ++curr;
277 }
278
279 return count;
280}
281
282/* reads `count' bytes from `offset', corrects possible errors with
283 erasure detection, and verifies the integrity of read data using
284 verity hash tree; returns the number of corrections in `errors' */
285static ssize_t verity_read(fec_handle *f, uint8_t *dest, size_t count,
286 uint64_t offset, size_t *errors)
287{
288 check(f);
289 check(dest);
290 check(offset < f->data_size);
291 check(offset + count <= f->data_size);
Tianjie Xu622e6282019-12-02 12:55:33 -0800292 check(!f->hashtree().hash_data.empty());
Sami Tolvanenc54a33d2015-06-26 14:28:31 +0100293 check(errors);
294
295 debug("[%" PRIu64 ", %" PRIu64 ")", offset, offset + count);
296
297 rs_unique_ptr rs(NULL, free_rs_char);
298 std::unique_ptr<uint8_t[]> ecc_data;
299
300 if (f->ecc.start && ecc_init(f, rs, ecc_data) == -1) {
301 return -1;
302 }
303
304 uint64_t curr = offset / FEC_BLOCKSIZE;
305 size_t coff = (size_t)(offset - curr * FEC_BLOCKSIZE);
306 size_t left = count;
307 uint8_t data[FEC_BLOCKSIZE];
308
Tianjie Xu622e6282019-12-02 12:55:33 -0800309 uint64_t max_hash_block =
310 (f->hashtree().hash_data.size() - SHA256_DIGEST_LENGTH) /
311 SHA256_DIGEST_LENGTH;
Sami Tolvanenc54a33d2015-06-26 14:28:31 +0100312
313 while (left > 0) {
314 check(curr <= max_hash_block);
Sami Tolvanenc54a33d2015-06-26 14:28:31 +0100315 uint64_t curr_offset = curr * FEC_BLOCKSIZE;
316
317 bool expect_zeros = is_zero(f, curr_offset);
318
319 /* if we are in read-only mode and expect to read a zero block,
320 skip reading and just return zeros */
Sami Tolvanen97988252018-08-02 11:13:15 -0700321 if ((f->mode & O_ACCMODE) == O_RDONLY && expect_zeros) {
Sami Tolvanenc54a33d2015-06-26 14:28:31 +0100322 memset(data, 0, FEC_BLOCKSIZE);
323 goto valid;
324 }
325
326 /* copy raw data without error correction */
Tianjie Xuef9dc362019-11-20 12:18:40 -0800327 if (!raw_pread(f->fd, data, FEC_BLOCKSIZE, curr_offset)) {
Sami Tolvanenc54a33d2015-06-26 14:28:31 +0100328 error("failed to read: %s", strerror(errno));
329 return -1;
330 }
331
Tianjie Xu622e6282019-12-02 12:55:33 -0800332 if (likely(f->hashtree().check_block_hash_with_index(curr, data))) {
Sami Tolvanenc54a33d2015-06-26 14:28:31 +0100333 goto valid;
334 }
335
336 /* we know the block is supposed to contain zeros, so return zeros
337 instead of trying to correct it */
338 if (expect_zeros) {
339 memset(data, 0, FEC_BLOCKSIZE);
340 goto corrected;
341 }
342
343 if (!f->ecc.start) {
344 /* fatal error without ecc */
345 error("[%" PRIu64 ", %" PRIu64 "): corrupted block %" PRIu64,
346 offset, offset + count, curr);
347 return -1;
348 } else {
349 debug("[%" PRIu64 ", %" PRIu64 "): corrupted block %" PRIu64,
350 offset, offset + count, curr);
351 }
352
353 /* try to correct without erasures first, because checking for
354 erasure locations is slower */
355 if (__ecc_read(f, rs.get(), data, curr_offset, false, ecc_data.get(),
Tianjie Xuef9dc362019-11-20 12:18:40 -0800356 errors) == FEC_BLOCKSIZE &&
Tianjie Xu622e6282019-12-02 12:55:33 -0800357 f->hashtree().check_block_hash_with_index(curr, data)) {
Sami Tolvanenc54a33d2015-06-26 14:28:31 +0100358 goto corrected;
359 }
360
361 /* try to correct with erasures */
362 if (__ecc_read(f, rs.get(), data, curr_offset, true, ecc_data.get(),
Tianjie Xuef9dc362019-11-20 12:18:40 -0800363 errors) == FEC_BLOCKSIZE &&
Tianjie Xu622e6282019-12-02 12:55:33 -0800364 f->hashtree().check_block_hash_with_index(curr, data)) {
Sami Tolvanenc54a33d2015-06-26 14:28:31 +0100365 goto corrected;
366 }
367
368 error("[%" PRIu64 ", %" PRIu64 "): corrupted block %" PRIu64
369 " (offset %" PRIu64 ") cannot be recovered",
370 offset, offset + count, curr, curr_offset);
371 dump("decoded block", curr, data, FEC_BLOCKSIZE);
372
373 errno = EIO;
374 return -1;
375
376corrected:
377 /* update the corrected block to the file if we are in r/w mode */
378 if (f->mode & O_RDWR &&
Tianjie Xuef9dc362019-11-20 12:18:40 -0800379 !raw_pwrite(f->fd, data, FEC_BLOCKSIZE, curr_offset)) {
Sami Tolvanenc54a33d2015-06-26 14:28:31 +0100380 error("failed to write: %s", strerror(errno));
381 return -1;
382 }
383
384valid:
385 size_t copy = FEC_BLOCKSIZE - coff;
386
387 if (copy > left) {
388 copy = left;
389 }
390
391 memcpy(dest, &data[coff], copy);
392
393 dest += copy;
394 left -= copy;
395 coff = 0;
396 ++curr;
397 }
398
399 return count;
400}
401
402/* sets the internal file position to `offset' relative to `whence' */
403int fec_seek(struct fec_handle *f, int64_t offset, int whence)
404{
405 check(f);
406
407 if (whence == SEEK_SET) {
408 if (offset < 0) {
409 errno = EOVERFLOW;
410 return -1;
411 }
412
413 f->pos = offset;
414 } else if (whence == SEEK_CUR) {
415 if (offset < 0 && f->pos < (uint64_t)-offset) {
416 errno = EOVERFLOW;
417 return -1;
418 } else if (offset > 0 && (uint64_t)offset > UINT64_MAX - f->pos) {
419 errno = EOVERFLOW;
420 return -1;
421 }
422
423 f->pos += offset;
424 } else if (whence == SEEK_END) {
425 if (offset >= 0) {
426 errno = ENXIO;
427 return -1;
428 } else if ((uint64_t)-offset > f->size) {
429 errno = EOVERFLOW;
430 return -1;
431 }
432
433 f->pos = f->size + offset;
434 } else {
435 errno = EINVAL;
436 return -1;
437 }
438
439 return 0;
440}
441
442/* reads up to `count' bytes starting from the internal file position using
443 error correction and integrity validation, if available */
444ssize_t fec_read(struct fec_handle *f, void *buf, size_t count)
445{
446 ssize_t rc = fec_pread(f, buf, count, f->pos);
447
448 if (rc > 0) {
449 check(f->pos < UINT64_MAX - rc);
450 f->pos += rc;
451 }
452
453 return rc;
454}
455
456/* for a file with size `max', returns the number of bytes we can read starting
457 from `offset', up to `count' bytes */
458static inline size_t get_max_count(uint64_t offset, size_t count, uint64_t max)
459{
460 if (offset >= max) {
461 return 0;
462 } else if (offset > max - count) {
463 return (size_t)(max - offset);
464 }
465
466 return count;
467}
468
469/* reads `count' bytes from `f->fd' starting from `offset', and copies the
470 data to `buf' */
Tianjie Xuef9dc362019-11-20 12:18:40 -0800471bool raw_pread(int fd, void *buf, size_t count, uint64_t offset) {
Sami Tolvanenc54a33d2015-06-26 14:28:31 +0100472 check(buf);
473
474 uint8_t *p = (uint8_t *)buf;
475 size_t remaining = count;
476
477 while (remaining > 0) {
Tianjie Xuef9dc362019-11-20 12:18:40 -0800478 ssize_t n = TEMP_FAILURE_RETRY(pread64(fd, p, remaining, offset));
Sami Tolvanenc54a33d2015-06-26 14:28:31 +0100479
480 if (n <= 0) {
481 return false;
482 }
483
484 p += n;
485 remaining -= n;
486 offset += n;
487 }
488
489 return true;
490}
491
492/* writes `count' bytes from `buf' to `f->fd' to a file position `offset' */
Tianjie Xuef9dc362019-11-20 12:18:40 -0800493bool raw_pwrite(int fd, const void *buf, size_t count, uint64_t offset) {
Sami Tolvanenc54a33d2015-06-26 14:28:31 +0100494 check(buf);
495
496 const uint8_t *p = (const uint8_t *)buf;
497 size_t remaining = count;
498
499 while (remaining > 0) {
Tianjie Xuef9dc362019-11-20 12:18:40 -0800500 ssize_t n = TEMP_FAILURE_RETRY(pwrite64(fd, p, remaining, offset));
Sami Tolvanenc54a33d2015-06-26 14:28:31 +0100501
502 if (n <= 0) {
503 return false;
504 }
505
506 p += n;
507 remaining -= n;
508 offset += n;
509 }
510
511 return true;
512}
513
514/* reads up to `count' bytes starting from `offset' using error correction and
515 integrity validation, if available */
516ssize_t fec_pread(struct fec_handle *f, void *buf, size_t count,
517 uint64_t offset)
518{
519 check(f);
520 check(buf);
521
522 if (unlikely(offset > UINT64_MAX - count)) {
523 errno = EOVERFLOW;
524 return -1;
525 }
526
Tianjie Xu622e6282019-12-02 12:55:33 -0800527 if (!f->hashtree().hash_data.empty()) {
Sami Tolvanenc54a33d2015-06-26 14:28:31 +0100528 return process(f, (uint8_t *)buf,
Tianjie Xuef9dc362019-11-20 12:18:40 -0800529 get_max_count(offset, count, f->data_size), offset,
530 verity_read);
Sami Tolvanenc54a33d2015-06-26 14:28:31 +0100531 } else if (f->ecc.start) {
532 check(f->ecc.start < f->size);
533
534 count = get_max_count(offset, count, f->data_size);
535 ssize_t rc = process(f, (uint8_t *)buf, count, offset, ecc_read);
536
537 if (rc >= 0) {
538 return rc;
539 }
540
541 /* return raw data if pure ecc read fails; due to interleaving
542 the specific blocks the caller wants may still be fine */
543 } else {
544 count = get_max_count(offset, count, f->size);
545 }
546
Tianjie Xuef9dc362019-11-20 12:18:40 -0800547 if (raw_pread(f->fd, buf, count, offset)) {
Sami Tolvanenc54a33d2015-06-26 14:28:31 +0100548 return count;
549 }
550
551 return -1;
552}