// ----------------------------------------------------------------------------
// graph_color.cpp 
//
// Beginning of Ja Ja 3-coloring algorithm on shared-memory SMP.
// pfh
//
// Note: Tabs at 4, indent style BSD
//
// To print out nicely, enscript -2rG --pretty-print -T4
//  However, enscript still does not get the tabs correct, sorry.
//
// Change log:
// 7/99 Re-write after NT ate the previous version in a blue screen crash.
// MS sucks.
// Now back to coding under linux.
//
// 9/99 Ported to DEC unix. Only had to add the RAND_MAX hack. 
// See notes in header file for details.
// 3 cheers for portable code!
//
// As of 8/99, serial version working and verified. Now to parallelize
// this bad boy.
//
// As of 10/10/99, correct on 256 or fewer points, all routines parallel.
// Removing some of the now-unused debug code, etc.
// Still need a bit of code to time the actual algorithm running.
//
// 10-12-99 or so, all code working, added loops to run par/serial code
// N times and time the results.
// 
// 10-17-99 Rewrote serial version to near optimal, added std. dev. calc.
// from numerical recipes, timings on Alpha 8400 show parallel code never
// faster...oops.
//
// 10-19-99 Incorporating DAB improvements to see if we can speed it up.
// Also fixed N==1 sigma bug.
// Machine is pretty busy, load > 6, but around 4M the parallel wins now!
//
// ----------------------------------------------------------------------------

// Defining this disables status printouts and assert macros
#define NDEBUG


#include "graph_color.h"


// ----------------------------------------------------------------------------
// Util fn for debugging, flushes stdout after each.
// Disabled if NDEBUG defined, which also disables assert macros.
void debug_print(const char *str, THREADED)
{

#if !defined(NDEBUG)

    on_one_thread
    {
	printf("%s", (char *) str);
	fflush(stdout);
    }
#endif


    return;
}


// ----------------------------------------------------------------------------
// DA Bader's SMP counting sort, modified to sort structs instead of
// just ints. 
// 10-9-99 Modifying this to return the final histogram - need it
// for the final pardo in the Ja Ja algorithm.
// Obscure commentary: You Are Not Expected to Understand This.

/****************************************************/
/* R (range)      must be a multiple of SMPS */
/* q (elems/proc) must be a multiple of SMPS */
/****************************************************/
void all_countsort_smp(int q,
		       s_struct *lKey,
		       s_struct *lSorted,
		       int *pdf,
		       int R,
		       int bitOff, 
		       int m,
		       THREADED)
{
    register int
	i,
	j,
	k,
        last, temp,
	offset;
    
    int *myHisto,
        *mhp,
        *mps,
        *psHisto,
        *allHisto,
	*ptr;

    myHisto  = (int *)node_malloc(THREADS*R*sizeof(int), TH);
    psHisto  = (int *)node_malloc(THREADS*R*sizeof(int), TH);

    mhp = myHisto + MYTHREAD*R;

    for (k=0 ; k<R ; k++)
	mhp[k] = 0;
    
    pardo(k, 0, q, 1)
	mhp[bits(lKey[k].c,bitOff,m)]++;

    node_Barrier();

    pardo(k, 0, R, 1) 
	{
	    last = psHisto[k] = myHisto[k];
	    for (j=1 ; j<THREADS ; j++) 
	    {
		temp = psHisto[j*R + k] = last + myHisto[j*R +  k];
		last = temp;
	    }
	}

    allHisto = psHisto+(THREADS-1)*R;
    
    node_Barrier();

    offset = 0;

    mps = psHisto + (MYTHREAD*R);
    for (k=0 ; k<R ; k++) 
    {
	mhp[k]  = (mps[k] - mhp[k]) + offset;
	offset += allHisto[k];
    }
    
    // pfh - as per discussions with DAB, at this point we can
    // score the final PDF. Use pardo to copy it to a smaller
    // array that we will pass back to the caller.

    // This j term is constant; have all threads compute it before they
    // run the loop. Moved the barrier just above to after this 
    // calculation.

    // As per 10-19, replaced with DAB's tweaked version
    ptr = myHisto + (THREADS - 1)*R;
    node_Barrier();

    pardo(i, 0, R, 1)
	pdf[i] = *(ptr + i);

    node_Barrier();

    // Back to stock code
    pardo(k, 0, q, 1) 
	{
	    j = bits(lKey[k].c,bitOff,m);

	    /* 
	       pfh mod - Replace shallow with deep copy 

	       Original code: 
	       lSorted[mhp[j]] = lKey[k];

	    */
	    lSorted[mhp[j]].c = lKey[k].c;
	    lSorted[mhp[j]].v = lKey[k].v;
	    mhp[j]++;
	}

    node_Barrier();

    node_free(psHisto, TH);
    node_free(myHisto, TH);
}

// ----------------------------------------------------------------------------
// Simple debug routine to print a graph, inorder or sequential or both.
void print_graph(bool inorder, bool sequential, const int *S, const int num_pts)
{
    int i, current;

    if(inorder)
    {
	// Start at first node in array
	current = 1;

	printf("\nInorder traversal: \n");
	for(i = 1; i <= num_pts + 1; i++)
	{
	    printf("%d->", S[current]);
	    current = S[current];
	}
    }

    if(sequential)
    {
	printf("\nSequential listing:\n");

	for(i = 1; i <= num_pts; i++)
	    printf("%d ", S[i]);
    }

    printf("\nEnd of graph listing.\n");
    return;
}

// ----------------------------------------------------------------------------
// Expanded version, prints colors & predecessors as well
void print_full_graph(const int *S, const int *P, const int *C, const int num_pts)
{
    int cur_vertex;
    int i;


    cur_vertex = 1;

    cur_vertex = P[S[cur_vertex]];

    printf("Complete graph:\n(Pred)<-[ curr ]->(succ) color\n\n");

    for(i = 1; i <= num_pts; i++)
    {
	printf("(%2d)<-[%2d]->(%2d), color %2d\n",
	       P[cur_vertex], cur_vertex,S[cur_vertex], C[cur_vertex]);

	cur_vertex = S[cur_vertex];
    }

    printf("\nEnd of graph listing.\n");
    return;
}

// ----------------------------------------------------------------------------
// Test function - count the unique colors in a given colormap
// as a simple check of algorithm efficacy.
// To do: Check predecessor/successor colors also.
// Note that this routine starts at array location zero, since the
// colormaps use zero also.
int count_colors(const int *colors, const int num_pts)
{
    int total_used = 0;
    int *ref_cnt = NULL;
    int i;


    ref_cnt = (int *) malloc(sizeof(int) * (num_pts + 1));
    assert(ref_cnt != NULL);

    for(i = 0; i <= num_pts; i++)
	ref_cnt[i] = 0;

    // Walk the color list, carefully incrementing ref counts
    for(i = 1; i <= num_pts; i++)
    {
	assert(colors[i] >= 0);
	assert(colors[i] <= num_pts);

	ref_cnt[colors[i]]++;
    }

    // Compute total used
    for(i = 0; i <= num_pts; i++)
    {
	if(ref_cnt[i] > 0)
	    total_used++;
    }

    free(ref_cnt);

    return(total_used);
}

// ----------------------------------------------------------------------------
// Function from D Bader to permute a given array. 
// This does an in-place modification of the argument, and expects an array
// of n+1 points from 0 to n.
void permute_array(int *S, const int num_pts)
{
    int		i, rnum, itmp;
    double  dtmp;
	
    /* 
       Seed the random-number generator with current time so that
       the numbers will be different every time we run.
       Code snipped from MSDN.

       Per DAB, use srandom and random instead of rand/srand. Not in 
       C standard, but better PRNG.
    */

    // srandom should return zero if init'd ok
    assert((srandom(time(NULL)) == 0));

    for(i = num_pts - 1; i >= 2; i--)
    {
	// Get random number from 0 to 1; map to [1, i]
	dtmp = (double) random() / RAND_MAX;
	dtmp = (dtmp * i) + 1.0;
	rnum = (int) floor(dtmp);

	assert(rnum >= 1);
	assert(rnum <= num_pts);

	// Swap s[i], s[j]
	itmp = S[i];
	S[i] = S[rnum];
	S[rnum] = itmp;
    }

    return;
}

// ----------------------------------------------------------------------------
// Generate a graph, in the format of Ja Ja's successor array.
// Caller responsible for freeing the returned vector.
int *generate_array(const int num_pts, THREADED)
{
    int	*list = NULL;
    int *result = NULL;
    int i;

    list = (int *) node_malloc(sizeof(int) * (num_pts + 2), TH);
    result = (int *) node_malloc(sizeof(int) * (num_pts + 1), TH);

    on_one_thread
    {
	for(i = 0; i <= num_pts; i++)
	{
	    list[i] = i;
	    result[i] = 1;
	}
	
	// Permute the array
	permute_array(list, num_pts + 1);
	
	// Pad last entry with first - wraparound padding
	list[num_pts + 1] = list[1];
	
	// Algorithm, courtesy of Chris Hurlburt, to convert 
	// the permuted vertex list into a valid successor array.
	for(i = 1; i <= num_pts; i++)
	    result[list[i]] = list[i + 1];
    }
    
    // Free permuted vector listing
    node_free(list, TH);
    
    // Return S vector
    return(result);
}

// ----------------------------------------------------------------------------
// Function to generate a predecessor list from an S-list.
// As allways, caller frees returned list.
// Serial algorithm is link dragging, order N.
// Parallel version uses pardo for N/P, but the memory references
// are so randomized as to parallelize poorly...or so I suspect. 
// This could stand some investigation.
int *S_to_P(const int *S, int *P, const int num_pts, THREADED)
{
    int i;
    
    node_Barrier();

    pardo(i, 1, (num_pts + 1), 1)
	P[S[i]] = i;

    node_Barrier();

    return;
}

// ----------------------------------------------------------------------------
// Function to return the LSB where two integers differ.
// Assumes 8-bit bytes, does not assume ints are any particular size, 
// can probably be made faster - this is linear in the
// number of bits. Returns -1 if numbers are equal.
int lsb_diff(const int in_1, const int in_2)
{
    const int num_bits = sizeof(int) * 8;

    int tmp1, tmp2;
    int	xor;
    int i;

    // Suggestion from DAB - do XOR outside loop
    xor = (in_1 ^ in_2);

    for(i = 0; i < num_bits; i++)
    {
	// Shift right so we can see LSB of interest
	tmp1 = (xor >> i);

	// Mask off all except lowest bit
	tmp2 = (tmp1 & 0x1);

	if(tmp2 == 1)
	    return(i);
    }

    // If got here, all bits equal. Gack.
    return(-1);
}

// ----------------------------------------------------------------------------
// Util function, picks the color for a vertex from the rules of Ja Ja, 
// page 80
// Only needs to know the colors of its neighbors, pass as const
// 10-19-99, new version optimized by DAB for speed, a little less readable
// but much less work required.
INLINE int color_vertex(const int pred, const int succ)
{
    int retval;

    if((pred + succ) == 1)
	retval = 2;
    else
	retval = ((!pred) || (!succ));

    assert(retval >= 0);
    assert(retval <= 2);
    assert(retval != pred);
    assert(retval != succ);

    return(retval);
}

// ----------------------------------------------------------------------------
// Verify a colormap by inorder traversal
// False for bad graph, true for good. Does not need to check both P/S arrays, 
// but does so anyway to catch moron errors.
bool verify_colormap(const int *S, const int *P, const int *C, const int num_pts)
{
    int current = 1;
    int i;

    for(i = 1; i <= num_pts; i++)
    {
	if((C[P[current]] == C[current]) || (C[S[current]] == C[current]))
	    return(false);
	
	current = S[current];
    }

    return(true);
}
    
// ----------------------------------------------------------------------------
// Serial version
// As above, caller must free the returned array.
int *serial_three_color(const int *S, const int num_pts)
{
    int *init_colors = NULL;
    int idx;
    int current_vertex = 1;
    
    // Allocate memory
    init_colors = (int *) malloc(sizeof(int) * (num_pts + 1));
    assert(init_colors != NULL);

    // -------
    // Coloring algorithm from JaJa pg75, my mod follows.
    for(idx = 1; idx < num_pts; idx++)
    {
	init_colors[current_vertex] = (idx % 2);
	current_vertex = S[current_vertex];
    }

    // My mod - color the last point 2, requires no decisions
    init_colors[current_vertex] = 2;
    
    return(init_colors);
}

// ----------------------------------------------------------------------------
// Algorithm 2.10 from ja ja for the 3-coloring of a simple cycle.
// As above, caller must free the returned array.
// Assumes eight-bit bytes.
void three_color(const int *S, int *second_colors, const int num_pts, THREADED)
{
    // Node-malloc'd memory
    int *init_colors = NULL;
    int *pdf = NULL;

    int *P = NULL;
    s_struct *bag_in = NULL;
    s_struct *bag_out = NULL;

    // Auto (stack) variables
    int cur_color, num_elements, start_idx, end_idx;
    int i, idx, k;
    int up_limit;
    // As of 10-19, these are local/auto to each thread.
    int cur_vertex,
	pred, 
	succ;


    // Allocate memory for colormaps
    init_colors = (int *) node_malloc(sizeof(int) * (num_pts + 1), TH);
    second_colors = (int *) node_malloc(sizeof(int) * (num_pts + 1), TH);
    
    // Predecessor array
    P = (int *) node_malloc(sizeof(int) * (num_pts + 1), TH);

    // Sort structs - 2-ints, vertex & color
    bag_in = (s_struct *) node_malloc(sizeof(s_struct) * (num_pts + 1), TH);
    bag_out = (s_struct *) node_malloc(sizeof(s_struct) * (num_pts + 1), TH);

    // 256 ints, holding PDF of sorted color array
    pdf = (int *) node_malloc(sizeof(int) * NUM_HBINS, TH);

    // ---------------------------------------------------
    // Step one: Init each color to its index
    debug_print("\nStep 1: Init the colormap.", TH);

    node_Barrier();

    pardo(i, 1, (num_pts + 1), 1)
	init_colors[i] = i;

    // ---------------------------------------------------
    // Step two: Compute the reduced colormap
    debug_print("\nStep 2: Applying algorithm 2.9...", TH);

    // Run the Ja Ja algorithm 2.9 to reduce the colormap
    node_Barrier();

    pardo(i, 1, (num_pts + 1), 1)
    {
	k = lsb_diff(init_colors[i], init_colors[S[i]]);

	second_colors[i] = (k << 1) + kth_lsb(init_colors[i], k);
    }

    node_Barrier();

    // Done with the initial colormap
    node_free(init_colors, TH);

// If NDEBUG defined, skip checks and printouts
#if !defined(NDEBUG)


    // DDT run diags on one CPU
    on_one_thread
    {
	printf("done. Now %d colors.",
	       count_colors(second_colors, num_pts));

	if(num_pts <= SIZE_TOOBIG)
	{
	    printf("\nColormap after 2.9: ");
	    for(i = 1; i <= num_pts; i++)
		printf("%d ", second_colors[i]);
	    
	    printf("\n");
	}
	
    }
#endif

	
    // ---------------------------------------------------
    // Ancillary step - generate predecessor array
    debug_print("\nStep 3a: Generating P array...", TH);

    node_Barrier();
    S_to_P(S, P, num_pts, TH);

    // Now, step 3 of Ja Ja 2.10 -- sort graph by colors
    // First, copy data into vector of new structs.
    debug_print("\nStep 3b: Copying data into struct via pardo...", TH);

    node_Barrier();
    pardo(i, 1, (num_pts+1), 1)
    {
	bag_in[i].c = second_colors[i];
	bag_in[i].v = i;
    }

    // ---------------------------------------------------    
    debug_print("\nStep 3c: Sorting graph by color... ", TH);

    /*
      Use DAB's sort - since we have up_limit as the max color, we
      only need to sort the LS byte - factor of eight speedup!
      Once this is debugged, we can lower the bound even more - 
      we only need ~40 colors, so could lower the bound to 64.

      Note the pointer arithmetic: sorting code uses 0 to n-1,
      so send it base+1 as starting point.
    */
    node_Barrier();

    all_countsort_smp(num_pts, bag_in+1, bag_out+1, pdf,
		      NUM_HBINS,  0, NUM_HBITS, TH);

    // Max color that is possible from step 2.9
    up_limit = 2 * (int) ceil((log( (double) num_pts) / log(2.0)));

    // Done with input list, free soonest
    node_free(bag_in, TH);

    // ---------------------------------------------------
    // Step 4 begins
    debug_print("\nStep 4: 3-coloring (Alg 2.10)...", TH);

    // Set current color to starting color
    cur_color = 3;

    // Main loop - pick a color, recolor all nodes present
    while(cur_color <= up_limit)
    {
        // Extract start/stop indices from PDF
	start_idx = pdf[cur_color - 1] + 1;
	end_idx = pdf[cur_color];
	num_elements = (end_idx - start_idx) + 1;

	// Make sure all nodes sync'd w/correct range
	node_Barrier();

#if !defined(NDEBUG)

	// debug printout if graph small enough
	on_one_thread
	{
	    if(num_pts <= SIZE_TOOBIG)
		printf("\nColor: %2d start_idx: %d end_idx: %d elements: %d",
		       cur_color, start_idx, end_idx, num_elements);

	    if((num_elements <= 0) && (num_pts <= SIZE_TOOBIG))
		printf(" - skipping.");

	    fflush(stdout);
	}
#endif

	// Test if this color isn't present
	if(num_elements <= 0)
	{
	    cur_color++;
	    continue;
	}

	// Ending condition - out of vertices to process
	if(start_idx > num_pts)
	    break;

	// Loop over all vertices of this color, in parallel, 
	// and recolor them.
	pardo(idx, start_idx, (end_idx+1), 1)
	{
	    // Look up each threads' vertex
	    cur_vertex = bag_out[idx].v;

	    // From there, get the neighbors
	    // This is cache hell, but it is read-only
	    pred = second_colors[P[cur_vertex]];
	    succ = second_colors[S[cur_vertex]];

	    // Use these to color the current vertex
	    // This is worse, cache-wise, and all writing
	    second_colors[cur_vertex] = color_vertex(pred, succ);
	}

	// Go to next color
	cur_color++;
    }

    // Threads sync here after breaking out of the above while loop
    node_Barrier();

// If NDEBUG defined, skip all checks
#if !defined(NDEBUG)


    // Do graph diagnostics on cpu zero only
    on_one_thread
    {
	debug_print("Done!\n\n", TH);
	
	i = count_colors(second_colors, num_pts);
	
	printf("%d colors after algorithm 2.10 - ", 
	       count_colors(second_colors, num_pts), i);
	
	if(i > 3)
	{
	    printf("Error: should have 3 colors!\n", i);

	    printf("Last vertex is colored %d\n\n", second_colors[num_pts]);
	}
	else
	    printf("Correct.\n");
	
	
	if(num_pts <= SIZE_TOOBIG)
	    print_full_graph(S, P, second_colors, num_pts);

	debug_print("\nVerifying colormap...", TH);

	if(verify_colormap(S, P, second_colors, num_pts) == false)
	    debug_print("Error in colormap!!\n", TH);
	else
	    debug_print("OK\n", TH);
    }
#endif


    // Housecleaning
    node_free(pdf, TH);
    node_free(P, TH);
    node_free(bag_out, TH);

    // Make sure we're all done
    node_Barrier();
    
    return;
}

// ----------------------------------------------------------------------------
/* Script function - run the algorithm at a given size for a given
   number of iterations, and return results & stats for same.
*/
void run_the_code(const int num_pts, const int num_iterations, 
		  double *min_time,
		  double *max_time,
		  double *avg_time,
		  double *sigma,
		  bool do_serial,
		  THREADED)
{
    int idx;
    double start_time, run_time, ttl_time;
    int *data_set = NULL;
    int *color_map = NULL;
    double *run_times = NULL;
    double dtmp, sum;

    ttl_time = 0.0;

    // Set min, max to values such that will be replaced on the first run
    *min_time = (double) num_pts * 10.0;
    *max_time = -0.01;

    // Allocate memory to hold runtimes - don't know how much
    // we will need until runtime.
    on_one_thread
    {
	run_times = (double *) malloc(sizeof(double) * num_iterations);
	assert(run_times != NULL);
    }

    for(idx = 0; idx < num_iterations; idx++)
    {
	// Generating the data set is not counted in the runtime.
	data_set = generate_array(num_pts, TH);

	// Do we want serial or parallel version?
	if(do_serial == true)
	{
	    on_one_thread
	    {
		start_time = get_seconds();

		color_map = serial_three_color(data_set, num_pts);

		run_time = get_seconds() - start_time;
	    }
	}
	else // Parallel code
	{
	    all_Barrier(TH);

	    start_time = get_seconds();
	
	    three_color(data_set, color_map, num_pts, TH);
	}

	run_time = get_seconds() - start_time;

	// Save time for stats later
	on_one_thread
	    run_times[idx] = run_time;

        // Update min, max, total times - do these on one thread.
	on_one_thread
	{
	    if(run_time < *min_time)
		*min_time = run_time;
	    
	    if(run_time > *max_time)
		*max_time = run_time;
	    
	    ttl_time += run_time;
	}

	// Housecleaning
	node_free(data_set, TH);

	// Serial code uses malloc, so different cleanup
	if(do_serial == true)
	    on_one_thread
		free(color_map);
	else
	    node_free(color_map, TH);
    }

    // Compute stats
    on_one_thread
    {
	// Mean
	*avg_time = (double) (ttl_time / num_iterations);

	// Compute std deviation
	sum = 0.0;

	for(idx = 0; idx < num_iterations; idx++)
	{
	    dtmp = (run_times[idx] - *avg_time);
	    sum += (dtmp * dtmp);
	}

	// Sigma undefined for N==1
	if(num_iterations > 1)
	    *sigma = sqrt(sum) * (1.0 / (num_iterations - 1.0));
	else
	    *sigma = *avg_time;

	free(run_times);
    }

    all_Barrier(TH);
    
    return;
}

// ----------------------------------------------------------------------------
// Main - create data, run the algorithm, save the stats.
void *SIMPLE_main(THREADED)
{
    const int min_size = (1 << 16);
    const int max_size = (1 << 23);
    const int num_iterations = 3;

    int cur_size;
    int idx;
    double ser_avg, par_avg, min, max, sig;


    on_one_thread
    {
	printf("\nParallel and serial three-coloring.");
	printf("\nMin size: %d Max size: %d Iterations per size: %d\n\n",
	       min_size, max_size, num_iterations);

	printf("P/S Size      Minimum Maximum Average Sigma  Ser/Par");
	fflush(stdout);
    }

    for(idx = 0, cur_size = min_size; 
	cur_size <= max_size;
	idx++, cur_size *= 2)
    {
	// Serial first
	run_the_code(cur_size,
		     num_iterations,
		     &min, &max, &ser_avg, &sig, true, TH);

	on_one_thread
	{
	    printf("\nSer %8d %7.4f %7.4f %7.4f %7.4f",
		   cur_size,
		   min,
		   max,
		   ser_avg,
		   sig);
	    fflush(stdout);
	}

	// Parallel timings
	run_the_code(cur_size,
		     num_iterations,
		     &min, &max, &par_avg, &sig, false, TH);

	on_one_thread
	{
	    // Div by zero if size too small - oops!
	    if(ser_avg < 0.0001)
		ser_avg = 0.0001;

	    printf("\nPar %8d %7.4f %7.4f %7.4f %7.4f %8.4f",
		   cur_size,
		   min,
		   max,
		   par_avg,
		   sig,
		   par_avg / ser_avg);
	    fflush(stdout);
	}
    }
		     
    on_one_thread
	printf("\n\n");

    return NULL;
}


syntax highlighted by Code2HTML, v. 0.8.8b