OpenMPI - Pi approximation via arctan()

Using point-to-point communication (MPI_Send() and MPI_Recv())

/*
| Pi approximation, based on integrating the derivative of arctan(x) over [0..1]
| ( i.e. pi/4 = arctan(1) = integral of 1/(1+x*x) dx from 0 to 1 )
| 2019-11-17
*/
#include <stdio.h>
#include <stdlib.h> // strtod()
#include <time.h>   // clock_gettime(), clock()
#include <math.h>   // log10()
#include <mpi.h>

// long double supports at most about 35 significant digits (often fewer, depending on platform)
#define PI35 3.1415926535897932384626433832795029L
//           -.---|----|----|----|----|----|----|


/*
| Left-endpoint Riemann sum of 4/(1+x*x) over the half-open interval
| [start, stop), sampled at x = start + i*dx for i = 0, 1, 2, ...
|
| The interval is half-open so that when two sub-ranges share a boundary
| (one range's stop is the next range's start), a sample landing exactly on
| that boundary is counted once instead of twice.
| Each sample position is computed from an integer step index rather than by
| repeatedly adding dx, so rounding error does not accumulate in x.
|
| start, stop: integration bounds
| dx:          step width; expected to be positive
| returns:     the approximated integral (4.0 factor included), or 0.0 when
|              dx is not positive (including NaN) or no sample lies in range.
*/
long double integral(long double start, long double stop, long double dx)
{
    // A zero or negative step would never reach 'stop'; NaN also lands here.
    if (!(dx > 0.0L))
        return 0.0L;

    long double sum = 0.0L;
    for (unsigned long long i = 0; ; i++) {
        const long double x = start + (long double)i * dx;
        // Written as !(x < stop) so a NaN position also ends the loop.
        if (!(x < stop))
            break;
        sum += dx / (1.0L + x * x);
    }
    return 4.0L * sum;
}


/*
| Entry point.
| Splits [0..1] evenly across the MPI ranks, has every rank sum its own
| sub-range with integral(), and collects the partial results on rank 0 with
| point-to-point MPI_Send()/MPI_Recv(). Rank 0 prints the approximation and
| its error; every rank prints its own wall-clock time.
|
| argv[1]: <nterms>, the number of steps across [0..1] (finite, >= 1)
| returns: 0 on success, 1 on a usage or argument error
*/
int main(int argc, char **argv)
{
    if (argc < 2) {
        fprintf(stderr, "usage: %s  <nterms>\n", argv[0]);
        fprintf(stderr, "    <nterms>: hopefully-large number\n");
        return 1;
    }

    // Parse at full long double precision, and reject input that would make
    // dx zero, negative, infinite or NaN before any MPI work is started.
    char *end = NULL;
    long double nterms = strtold(argv[1], &end);
    if (end == argv[1] || *end != '\0' || !isfinite(nterms) || nterms < 1.0L) {
        fprintf(stderr, "%s: <nterms> must be a finite number >= 1 (got \"%s\")\n",
                argv[0], argv[1]);
        return 1;
    }
    long double dx = 1.0L / nterms;

    //------------------------------------------------------------------
    // Get the starting time with sub-second precision...
    struct timespec walltime[2];
    clock_gettime(CLOCK_REALTIME, (walltime+0));
    //------------------------------------------------------------------

    MPI_Init(&argc, &argv); // args not needed, but supplied anyway.
    int world_size, processor_rank;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &processor_rank);
    int name_len;
    char processor_name[MPI_MAX_PROCESSOR_NAME];
    MPI_Get_processor_name(processor_name, &name_len);

    // Each rank gets an equal-width slice of [0..1].
    long double range = 1.0L / world_size;
    long double start = processor_rank * range;
    long double stop = (processor_rank + 1) * range;

    //------------------------------------------------------------------
    // Sum up the differentials, and calculate pi:

    // Everybody computes:
    long double pi_piece = integral(start, stop, dx);   // 4.0 factor in the ftn...

    int tag = 0;
    if (processor_rank != 0) {  // not root process
        MPI_Send(&pi_piece, 1, MPI_LONG_DOUBLE, 0, tag, MPI_COMM_WORLD);

    } else {    // root process
        // Start from the root's own piece, then add every other rank's piece.
        long double pi_approx = pi_piece;
        for (int rank = 1; rank < world_size; rank++) {
            MPI_Recv(&pi_piece, 1, MPI_LONG_DOUBLE, rank, tag,
                        MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            pi_approx += pi_piece;
        }

        //------------------------------------------------------------------
        // Report the calculation results:
        long double error = pi_approx - PI35;
        if (error < 0.0L)
            error = -error;
        // Show roughly as many decimals as the error's order of magnitude.
        // log10(0) is -inf, and converting that to int is undefined, so a
        // zero error falls back to a fixed number of digits.
        int width = 20;
        if (error > 0.0L)
            width = (int)fabsl(log10l(error)) + 2;
        printf("#----\npi35bits: %.*Lf\n  approx: %.*Lf\nerror=%.3Le\n#----\n",
                width, PI35, width, pi_approx, error);

    }

    MPI_Finalize();

    //------------------------------------------------------------------
    // Report the running time with sub-second precision:
    clock_gettime(CLOCK_REALTIME, (walltime+1));
    struct timespec deltat;
    deltat.tv_sec = walltime[1].tv_sec - walltime[0].tv_sec;
    deltat.tv_nsec = walltime[1].tv_nsec - walltime[0].tv_nsec;
    if (deltat.tv_nsec < 0) {
        // Borrow one second so the nanosecond part is non-negative.
        deltat.tv_sec--;
        deltat.tv_nsec += 1000000000L ;
    }
    struct timespec rest;
    clock_getres(CLOCK_REALTIME, &rest);
    // time_t has no printf conversion of its own; cast so the arguments
    // match the %ld conversions exactly.
    printf("process %s:%02i wallclock: %ld.%09ld seconds\t(resolution: %ld ns)\n",
            processor_name, processor_rank,
            (long)deltat.tv_sec, (long)deltat.tv_nsec, (long)rest.tv_nsec);
    //------------------------------------------------------------------

    return 0;
}