E. Generating Medium Data (1 TB+)

Don't worry: you don't need to download terabytes of data from us to reproduce the Medium Data experiments. We used the C code below to generate medium1.csv. It creates a 7-billion-row by 5-column CSV file with a header; each %.22e value prints as 28 characters, so a row is 145 bytes and the whole file comes to roughly 1 TB.

#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv) {
  long num_cols = 5;
  long num_rows = 7000000000;
  long row, col;
  FILE *f = fopen("medium1.csv", "w");
  fprintf(f, "col1");
  for (col = 1; col < num_cols; col++) {
    fprintf(f, ",col%lu", (col+1));
  }
  fprintf(f, "\n");
  /* data rows: five uniform [0,1) values per row */
  for (row = 0; row < num_rows; row++) {
    fprintf(f, "%.22e", drand48());
    for (col = 1; col < num_cols; col++) {
      fprintf(f, ",%.22e", drand48());
    }
    fprintf(f, "\n");
  }
  fclose(f);
  return 0;
}

On Linux, compile with:

gcc -O3 make_medium1.c -o make_medium1

and then run it with:

./make_medium1

It's painfully slow because it generates and writes every row sequentially in a single process.
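
If the wait bothers you, one workaround (not what we used for the benchmarks) is to generate the file in independent chunks that run in parallel and are concatenated behind a single header afterwards. erand48() takes an explicit 48-bit state, so each chunk can draw from its own random stream. A minimal sketch, with a hypothetical make_chunk.c that takes a chunk index and row count on the command line:

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical chunked generator -- a sketch, not the code we used.
   Writes chunk_<id>.csv using its own erand48() state so several copies
   can run at once; the result will not match medium1.csv byte for byte. */
int main(int argc, char **argv) {
  long num_cols = 5;
  long chunk_id, num_rows, row, col;
  unsigned short state[3];
  char name[64];
  FILE *f;

  if (argc < 3) {
    fprintf(stderr, "usage: %s chunk_id num_rows\n", argv[0]);
    return 1;
  }
  chunk_id = atol(argv[1]);
  num_rows = atol(argv[2]);
  state[0] = 0x330E;                      /* per-chunk random state */
  state[1] = (unsigned short) chunk_id;
  state[2] = 0x1234;
  snprintf(name, sizeof(name), "chunk_%ld.csv", chunk_id);
  f = fopen(name, "w");
  for (row = 0; row < num_rows; row++) {
    fprintf(f, "%.22e", erand48(state));
    for (col = 1; col < num_cols; col++) {
      fprintf(f, ",%.22e", erand48(state));
    }
    fprintf(f, "\n");
  }
  fclose(f);
  return 0;
}

Run as many copies as you have cores, write the col1,...,col5 header line into the final file yourself, and append the chunk files to it in order. The result is statistically similar to medium1.csv but not identical to it.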

We used slightly different code to generate medium2.csv:

#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv) {
  long num_cols = 5;
  long num_rows = 35000000000;
  long row, col;
  FILE *f = fopen("medium2.csv", "w");
  fprintf(f, "col1");
  for (col = 1; col < num_cols; col++) {
    fprintf(f, ",col%lu", (col+1));
  }
  fprintf(f, "\n");
  for (row = 0; row < num_rows; row++) {
    if (row % 1000000000 == 0) {
      printf("%ld\n", row);  /* progress: one line per billion rows */
    }
    /* column 1 is uniform [0,1); column i+1 gets an offset of i */
    fprintf(f, "%.22e", drand48());
    for (col = 1; col < num_cols; col++) {
      fprintf(f, ",%.22e", drand48()+col);
    }
    fprintf(f, "\n");
  }
  fclose(f);
  return 0;
}

This program creates a 35-billion-row by 5-column CSV file (~5 TB) with a header; compile and run it the same way as make_medium1.c. It adds an offset of i to the (i+1)'th column for i = 0, ..., 4, so the column means should be roughly 0.5, 1.5, 2.5, 3.5 and 4.5. It takes about a day to finish and prints a progress line every billion rows.
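
If you want to sanity-check the column means without loading the file into any analysis tool, a one-pass streaming mean is enough. A minimal sketch (a hypothetical check_means.c, not part of the benchmark code; reading 5 TB back still takes a while):

#include <stdio.h>

/* Hypothetical sanity check -- streams medium2.csv once and prints the
   mean of each column; they should be near 0.5, 1.5, 2.5, 3.5 and 4.5. */
int main(void) {
  double x[5], sum[5] = {0};
  long rows = 0;
  int i;
  char header[256];
  FILE *f = fopen("medium2.csv", "r");

  if (f == NULL || fgets(header, sizeof(header), f) == NULL)
    return 1;                                   /* skip the header line */
  while (fscanf(f, "%lf,%lf,%lf,%lf,%lf",
                &x[0], &x[1], &x[2], &x[3], &x[4]) == 5) {
    for (i = 0; i < 5; i++) sum[i] += x[i];
    rows++;
  }
  for (i = 0; i < 5; i++)
    printf("col%d mean: %f\n", i + 1, sum[i] / rows);
  fclose(f);
  return 0;
}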

Note: neither program calls srand48(), so drand48() starts from its default state (see the Linux manual page for drand48) and the output is the same on every run; call srand48() with a seed of your choice if you want a different realization.
