C version
Clearly, the Python Reed Solomon erasure code library in the previous post isn't meant to be fast.
Now, let's build a fast C version.
- Convert the code to C line by line.
- Inline calls to gf256_add() and gf256_sub() functions. They are just xors.
- Optimize the multiplication in compute_chunks() into a table lookup. One output byte now requires only K table lookups and K xors.
- Unroll the critical inner loop in compute_chunks() 8 ways to do 64 bit load and stores. This unrolling results in a 2x speed up.
- Rearrange the three loops in compute_chunks() to read the input chunks once only.
- The two inner loops can be swapped. If number of output chunks is small (like 3), it is slightly faster to loop over the output chunks as the innermost loop.
// cc -O3
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <sys/time.h>
typedef uint8_t byte;
typedef byte *bytearray;
typedef uint64_t b8;
// GF(2**8)
byte gf256_log[256]; // zeroth entry is invalid
byte gf256_antilog[255]; // yes, only 255 entries
byte *gf256_mul_table = NULL; // [256*256] row*column multiplication table
byte gf256_slow_mul(byte x, byte y) { // long multiplication
byte r = 0;
while (y > 0) {
if (y&1)
r ^= x; // add
if (x&0x80)
x = (x<<1)^0x1b; // reduce by polynomial 0x11b
else
x <<= 1;
y >>= 1;
}
return r;
}
void gf256_init_tables(void) {
byte z = 1;
for (byte x = 0; x < 255; x++) {
assert(z != 0);
gf256_antilog[x] = z;
gf256_log[z] = x;
z = gf256_slow_mul(z, 3);
}
gf256_mul_table = malloc(256*256);
for (int r = 0; r < 256; r++) {
for (int c = 0; c < 256; c++)
gf256_mul_table[(r<<8)+c] = gf256_slow_mul(r, c);
}
}
byte gf256_div(byte x, byte y) {
assert(y != 0);
if (x == 0)
return 0;
return gf256_antilog[((int) gf256_log[x] - gf256_log[y] + 255) % 255];
}
byte gf256_mul(byte x, byte y) {
return gf256_mul_table[(x<<8)+y];
}
void lag_interp_weights(const int K, const byte *xs, const byte x, byte **w) {
// assume len(xs) == K
// assume len(w) == K
for (int i = 0; i < K; i++) {
int t = 1;
for (int j = 0; j < K; j++) {
if (i != j)
t = gf256_mul(t, gf256_div((x ^ xs[j]), xs[i] ^ xs[j]));
}
w[i] = &gf256_mul_table[t<<8];
}
}
void compute_chunks(const int K, const byte *xs, const int L, const bytearray *chunks, const int T, const byte *ts, const bytearray *results) {
// assume len(xs) == K
// assume len(chunks) == K
// assume len(chunks[i]) == L for i = 0 to K-1
// assume len(ts) == T
// assume len(results) == T
// assume len(results[i]) == L for i = 0 to T-1
// assume not x in xs for x in ts
assert(L % 8 == 0);
bytearray inputs[K];
byte *weights[T][K];
for (int i = 0; i < K; i++)
inputs[i] = chunks[xs[i]];
for (int j = 0; j < T; j++)
lag_interp_weights(K, xs, ts[j], weights[j]);
{
bytearray inp = inputs[0];
for (int z = 0; z < L; z += 8) {
const b8 dd = * (b8*) &inp[z];
const byte d1 = dd;
const byte d2 = dd >> 8;
const byte d3 = dd >> 16;
const byte d4 = dd >> 24;
const byte d5 = dd >> 32;
const byte d6 = dd >> 40;
const byte d7 = dd >> 48;
const byte d8 = dd >> 56;
for (int j = 0; j < T; j++) {
const byte *w = weights[j][0];
* (b8*) &results[j][z] = (b8) w[d1] | (b8) w[d2] << 8 | (b8) w[d3] << 16 | (b8) w[d4] << 24 | (b8) w[d5] << 32 | (b8) w[d6] << 40 | (b8) w[d7] << 48 | (b8) w[d8] << 56;
}
}
}
for (int i = 1; i < K; i++) {
bytearray inp = inputs[i];
for (int z = 0; z < L; z += 8) {
const b8 dd = * (b8*) &inp[z];
const byte d1 = dd;
const byte d2 = dd >> 8;
const byte d3 = dd >> 16;
const byte d4 = dd >> 24;
const byte d5 = dd >> 32;
const byte d6 = dd >> 40;
const byte d7 = dd >> 48;
const byte d8 = dd >> 56;
for (int j = 0; j < T; j++) {
const byte *w = weights[j][i];
* (b8*) &results[j][z] ^= (b8) w[d1] | (b8) w[d2] << 8 | (b8) w[d3] << 16 | (b8) w[d4] << 24 | (b8) w[d5] << 32 | (b8) w[d6] << 40 | (b8) w[d7] << 48 | (b8) w[d8] << 56;
}
}
}
}
double now() {
struct timeval tv;
gettimeofday(&tv, 0);
return (double) tv.tv_sec + tv.tv_usec / 1000000.0;
}
void *alloc_buf(size_t size) {
return malloc(size);
}
void dealloc_buf(void *p, size_t size) {
free(p);
}
void encode_chunks(const int K, const int N, const int L, bytearray *chunks) {
// assume len(chunks) == N.
// assume len(chunks[x]) == L for x = 0 to K-1
// assume chunk[x] == NULL, for x = K to N-1
// computed chunks are stored in chunks[x] for x = K to N-1
assert(K > 0 && K < N && N <= 256);
byte xs[K];
for (int i = 0; i < K; i++)
xs[i] = i;
const int T = N-K;
byte ts[T];
bytearray results[T];
for (int t = 0; t < T; t++) {
const int x = K+t;
ts[t] = x;
results[t] = alloc_buf(L);
chunks[x] = results[t];
}
compute_chunks(K, xs, L, chunks, T, ts, results);
}
void decode_chunks(const int K, const int N, const int L, bytearray *chunks) {
// assume len(chunks) == N.
// assume at least K non-NULL chunks[x] for x = 0 to N-1
// assume len(chunks[x]) == L for x = 0 to N-1 if chunks[x] is not NULL
// computed chunks are stored in chunks[x] for x = 0 to K-1 if chunks[x] is NULL
assert(K > 0 && K < N && N <= 256);
byte xs[K];
int j = 0;
for (int x = 0; x < N && j < K; x++) {
if (chunks[x] != NULL)
xs[j++] = x;
}
assert(j == K);
byte ts[N-K];
bytearray results[N-K];
int T = 0;
for (int x = 0; x < K; x++) {
if (chunks[x] == NULL) {
ts[T] = x;
results[T] = alloc_buf(L);
chunks[x] = results[T];
T += 1;
}
}
compute_chunks(K, xs, L, chunks, T, ts, results);
}
// --- tests below ---
byte *temp = NULL;
const int TEMP_SIZE = 256*1024*1024;
void setup_temp() {
temp = malloc(TEMP_SIZE);
}
void clear_cache() {
for (int i = 0; i < TEMP_SIZE; i++) {
temp[i] = i;
}
printf("Cache cleared\n");
}
void gen_data(const int K, const int N, const int L, bytearray *p) {
srand(1);
for (int i = 0; i < N; i++)
p[i] = NULL;
for (int i = 0; i < K; i++) {
bytearray d = p[i] = alloc_buf(L);
for (int j = 0; j < L; j++)
d[j] = rand();
}
}
int chunks_equal(const int K, const int L, const bytearray *chunks1, const bytearray *chunks2) {
for (int x = 0; x < K; x++) {
if (chunks1[x] != chunks2[x] &&
memcmp(chunks1[x], chunks2[x], L) != 0) {
printf("Chunks %d are not equal", x);
return 0;
}
}
return 1;
}
void test_random(const int K, const int N, const int L, const int encode_rounds, const int decode_rounds) {
bytearray **test_chunks = malloc(encode_rounds*sizeof(bytearray*));
for (int r = 0; r < encode_rounds; r++) {
test_chunks[r] = malloc(N*sizeof(bytearray));
gen_data(K, N, L, test_chunks[r]);
}
clear_cache();
double estart = now();
for (int r = 0; r < encode_rounds; r++) {
encode_chunks(K, N, L, test_chunks[r]);
}
double encode_time = now() - estart;
printf("Encode time: %fs %d rounds %fs/round input rate: %.3fMB/s\n", encode_time, encode_rounds, encode_time / encode_rounds, (double) K*L*encode_rounds/1024/1024/encode_time);
clear_cache();
double dstart = now();
srand(2);
for (int round = 0; round < decode_rounds; round++) {
bytearray *chunks = test_chunks[round % encode_rounds];
bytearray temp[N];
for (int i = 0; i < N; i++)
temp[i] = NULL;
for (int i = 0; i < K; i++) {
int x = 0;
do {
x = rand() % N;
} while (temp[x] != NULL);
temp[x] = chunks[x];
}
decode_chunks(K, N, L, temp);
assert(chunks_equal(K, L, chunks, temp));
for (int i = 0; i < K; i++) {
if (temp[i] != NULL && temp[i] != chunks[i])
dealloc_buf(temp[i], L);
}
}
double decode_time = now() - dstart;
printf("Decode time: %fs %d rounds %fs/round input rate: %.3fMB/s\n", decode_time, decode_rounds, decode_time / decode_rounds, (double) K*L*decode_rounds/1024/1024/decode_time);
printf("random_%d_%d_%d succeeded\n", K, N, L);
for (int r = 0; r < encode_rounds; r++) {
for (int i = 0; i < N; i++)
dealloc_buf(test_chunks[r][i], L);
dealloc_buf(test_chunks[r], N*sizeof(bytearray));
}
dealloc_buf(test_chunks, encode_rounds*sizeof(bytearray*));
}
void start_comb(const int K, int *v) {
for (int i = 0; i < K; i++)
v[i] = i;
}
int next_comb(const int K, const int N, int *v) {
int x = K-1;
while (x >= 0) {
if (v[x] < N-1) {
v[x] += 1;
for (int j = x+1; j < K; j++)
v[j] = v[j-1] + 1;
if (v[K-1] < N)
return 0;
}
x -= 1;
}
return 1;
}
void test_all(const int K, const int N, const int L) {
bytearray chunks[N];
gen_data(K, N, L, chunks);
clear_cache();
double estart = now();
encode_chunks(K, N, L, chunks);
double encode_time = now() - estart;
printf("Encode time: %fs input rate: %.3fMB/s\n", encode_time, (double) K*L/1024/1024/encode_time);
clear_cache();
double dstart = now();
int valid[K];
int count = 0;
int done = 0;
start_comb(K, valid);
while (!done) {
bytearray temp[N];
for (int i = 0; i < N; i++)
temp[i] = NULL;
for (int i = 0; i < K; i++) {
temp[valid[i]] = chunks[valid[i]];
}
count += 1;
decode_chunks(K, N, L, temp);
assert(chunks_equal(K, L, chunks, temp));
done = next_comb(K, N, valid);
for (int i = 0; i < K; i++) {
if (temp[i] != NULL && temp[i] != chunks[i])
dealloc_buf(temp[i], L);
}
}
double decode_time = now() - dstart;
printf("Decode time: %fs %d combinations %fs/combination input rate: %.3fMB/s\n", decode_time, count, decode_time / count, (double) K*L*count/1024/1024/decode_time);
printf("Encode time plus decode time: %f\n", encode_time + decode_time);
printf("all_%d_%d_%d %d combinations succeeded\n", K, N, L, count);
for (int i = 0; i < N; i++)
dealloc_buf(chunks[i], L);
}
int main() {
gf256_init_tables();
setup_temp();
test_random(16, 24, 65536, 100, 100);
test_random(16, 24, 1024*1024, 100, 10);
test_random(17, 20, 64*1024*1024, 4, 1);
test_random(17, 20, 1024*1024, 100, 100);
test_random(17, 20, 200*1024, 100, 100);
test_all(17, 20, 1024);
test_all(10, 16, 1024);
test_all(16, 24, 16);
test_all(16, 24, 1024);
return 0;
}
Performance
K=16, N=24, L=64KB (1MB) encoding time 1.7s. Input rate: 0.58 MB/s.
K=16, N=24, L=1MB (16MB) encoding time 28s. Input rate: 0.57 MB/s.
K=17, N=20, L=200KB (3.3MB) encoding time 2.1s. Input rate: 1.58 MB/s.
test_all(10, 16, 1024) # 8008 combinations. 10KB input. 1min.
test_all(16, 24, 16) # 735471 combinations. 256B input. 9.5min.
test_all(16, 24, 1024) # 735471 combinations. 16KB input. 5hours.
K=16, N=24, L=64KB (1MB) encoding time 0.0016s. Input rate: 625 MB/s.
K=16, N=24, L=1MB (16MB) encoding time 0.03s. Input rate: 510 MB/s.
test_all(10, 16, 1024) # 8008 combinations. 10KB input. 0.058s.
test_all(16, 24, 16) # 735471 combinations. 256B input. 1.47s.
test_all(16, 24, 1024) # 735471 combinations. 16KB input. 12.3s.
K=16, N=24, L=1MB (16MB) encoding time 0.063s. Input rate: 255 MB/s.
test_all(10, 16, 1024) # 8008 combinations. 10KB input. 0.12s.
test_all(16, 24, 16) # 735471 combinations. 256B input. 3.3s.
test_all(16, 24, 1024) # 735471 combinations. 16KB input. 23.9s.
Conclusion
Here, we are only using Reed Solomon coding for handling missing data (erasure). We can use a separate checksum per chunk for error detection and treat error chunks as missing data. It is simple and cheap to add per chunk checksum. Used in this way, it can handle N-K missing or error chunks. It achieves the same results but skips the complexity.
Simple. Maintainable. Fast.
It has been a lot of fun writing this code.
See:
https://en.wikipedia.org/wiki/Reed–Solomon_error_correction
https://research.swtch.com/field
https://www.cs.cmu.edu/~guyb/realworld/reedsolomon/reed_solomon_codes.html.

