E. Generating Medium Data (1 TB+)
Don't worry: you don't need to download terabytes of data from us to reproduce the Medium Data experiments. We used this C code to generate medium1.csv. It creates a 7 billion row by 5 column CSV file with a header; since each %.22e field is 28 characters, a row is about 145 bytes and the whole file comes to roughly 1 TB.
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv) {
    long num_cols = 5;
    long num_rows = 7000000000;
    long row, col;
    FILE *f = fopen("medium1.csv", "w");
    /* Write the header row: col1,col2,...,col5 */
    fprintf(f, "col1");
    for (col = 1; col < num_cols; col++) {
        fprintf(f, ",col%ld", col + 1);
    }
    fprintf(f, "\n");
    /* Write num_rows rows of uniform random values in [0, 1). */
    for (row = 0; row < num_rows; row++) {
        fprintf(f, "%.22e", drand48());
        for (col = 1; col < num_cols; col++) {
            fprintf(f, ",%.22e", drand48());
        }
        fprintf(f, "\n");
    }
    fclose(f);
    return 0;
}
On Linux, compile with:
gcc -O3 make_medium1.c -o make_medium1
and then run it with:
./make_medium1
It's painfully slow because it's sequential.
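If you are impatient, one workaround (not something we did ourselves) is to generate the file in independent chunks and concatenate them afterwards. The sketch below is illustrative only: it assumes glibc's drand48_r() and OpenMP, gives every chunk its own seed (so the values will not match the sequential program byte for byte), and the chunk count and file names are made up.
/* Hypothetical parallel variant (not the code used for the experiments):
 * each OpenMP thread writes its own chunk file with a private drand48_r()
 * state. Assumes num_chunks evenly divides num_rows. */
#define _DEFAULT_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

int main(void) {
    const long num_cols = 5;
    const long num_rows = 7000000000L;
    const int num_chunks = 16;                 /* illustrative chunk count */
    const long rows_per_chunk = num_rows / num_chunks;

    /* Write the header separately so every chunk file stays uniform. */
    FILE *h = fopen("header.csv", "w");
    fprintf(h, "col1,col2,col3,col4,col5\n");
    fclose(h);

    #pragma omp parallel for
    for (int chunk = 0; chunk < num_chunks; chunk++) {
        char name[64];
        snprintf(name, sizeof(name), "chunk_%02d.csv", chunk);
        FILE *f = fopen(name, "w");
        struct drand48_data state;
        srand48_r(chunk + 1, &state);          /* distinct seed per chunk */
        double x;
        for (long row = 0; row < rows_per_chunk; row++) {
            for (long col = 0; col < num_cols; col++) {
                drand48_r(&state, &x);
                fprintf(f, col ? ",%.22e" : "%.22e", x);
            }
            fprintf(f, "\n");
        }
        fclose(f);
    }
    return 0;
}
Compile with gcc -O3 -fopenmp and then stitch the pieces together with cat header.csv chunk_*.csv > medium1.csv (the zero-padded chunk names keep the shell glob in numeric order).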
We used slightly different code to generate medium2.csv:
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv) {
    long num_cols = 5;
    long num_rows = 35000000000;
    long row, col;
    FILE *f = fopen("medium2.csv", "w");
    /* Write the header row: col1,col2,...,col5 */
    fprintf(f, "col1");
    for (col = 1; col < num_cols; col++) {
        fprintf(f, ",col%ld", col + 1);
    }
    fprintf(f, "\n");
    for (row = 0; row < num_rows; row++) {
        /* Print progress every billion rows. */
        if (row % 1000000000 == 0) {
            printf("%ld\n", row);
        }
        /* Column 0 is uniform on [0, 1); column col adds col to it. */
        fprintf(f, "%.22e", drand48());
        for (col = 1; col < num_cols; col++) {
            fprintf(f, ",%.22e", drand48() + col);
        }
        fprintf(f, "\n");
    }
    fclose(f);
    return 0;
}
This program creates a 35 billion row by 5 column CSV file (~5 TB) with a header. It adds i to the i'th column (counting columns from zero), so the mean of the i'th column should be roughly i + 0.5. It takes about a day to finish.
Note: the program never calls srand48(), and according to the Linux manual page the seed is 1 by default, so repeated runs produce identical output.
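If you want to sanity-check a generated file, a one-pass C program like the sketch below (again, illustrative rather than part of the benchmark) can stream the CSV and print per-column means; for medium2.csv the i'th column, counting from zero, should come out near i + 0.5. It is just as sequential as the generator, so expect it to take a while on a multi-terabyte file.
/* Illustrative sanity check (not part of the benchmark): stream the CSV
 * once and print the mean of each column. Assumes the 5-column layout
 * produced by the programs above. */
#include <stdio.h>

int main(void) {
    FILE *f = fopen("medium2.csv", "r");
    if (f == NULL) {
        perror("fopen");
        return 1;
    }
    char header[256];
    fgets(header, sizeof(header), f);          /* skip the header row */

    double sum[5] = {0};
    double x[5];
    long n = 0;
    while (fscanf(f, "%lf,%lf,%lf,%lf,%lf",
                  &x[0], &x[1], &x[2], &x[3], &x[4]) == 5) {
        for (int i = 0; i < 5; i++) sum[i] += x[i];
        n++;
    }
    for (int i = 0; i < 5; i++)
        printf("col%d mean: %f (expected about %f)\n",
               i + 1, sum[i] / n, i + 0.5);
    fclose(f);
    return 0;
}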