/*
 * 				Mark Gao, Apr, 2016
 */

#include <stdio.h>
#include <stdarg.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <stdlib.h>
#include <string.h>
#include <dirent.h>
#include <ctype.h>
#include <getopt.h>
#include <errno.h>
#include <assert.h>
#include <math.h>
#include <time.h>
#include <unistd.h>
#include <regex.h>
/* Diagnostics: print "# FFL(file, function, line): msg" to stderr / to fp. */
#define DBG(msg) fprintf(stderr, "# FFL(%s, %s, %d): %s\n", __FILE__, __func__, __LINE__, (msg))
#define LOG(fp, msg) fprintf((fp), "# FFL(%s, %s, %d): %s\n", __FILE__, __func__, __LINE__, (msg))

/* +1 when (i + j) is even, -1 otherwise. */
#define matrix_det_sign(i, j) ( ( ((i)+(j)) % 2 == 0 ) ? 1 : -1 )

/* Index arithmetic for a binary heap stored in an array (parent = i/2). */
#define heap_parent(i)  ( (i) / 2)
#define heap_left(i)    ( (i) * 2)
#define heap_right(i)   ( (i) * 2 + 1)

/*
 * Number of elements of a true array (not a pointer).  The expansion is
 * fully parenthesized so that it can be used inside larger expressions,
 * e.g. "total / num_arr_elements(a)".
 */
#define num_arr_elements(arr)  (sizeof(arr) / sizeof((arr)[0]))

#define DS_LEN        102400
#define BUF_LEN       1024
/* MAXFLOAT is left unparenthesized on purpose: it stays token-identical to
 * the definition some <math.h> implementations already provide. */
#define MAXFLOAT	  3.40282347e+38F
#define MINFLOAT	 (-3.40282347e+38F)

/* Angle conversions; arguments are parenthesized so "RAD2DEG(a + b)" works. */
#define RAD2DEG(rad)  ( (rad) * 180.0 / M_PI )
#define DEG2RAD(deg)  ( (deg) * M_PI / 180.0 )

/* 20*log10(f) and its inverse 10^(db/20). */
#define DB(f)         ( 20.0 * log10(f) )
#define IDB(db)       ( pow(10., (db) / 20.0) )
/*
 * Field accessors for struct matrix_op_s ("mop") and struct matrix_dsc_s
 * ("mxp") pointers.
 */
#define MXESINMOP(mop) ((mop)->num_matrixes)
#define MOP(p)        ((p)->mop)
#define MXP(p)        ((p)->mxp)
#define MOPMXNUM(p)   ((p)->num_matrixes)
#define MOP2MXP(p, n) ((p)->mpp[(n)])
#define MXID(mxp)     ((mxp)->id)
#define MX_NONIDX     9999999
#define MXCOLS(mxp)   ((mxp)->num_cols)
#define MXROWS(mxp)   ((mxp)->num_rows)
#define MXRPP(mxp)    ((mxp)->rowp)
#define MXRPPSTR(mxp) ((mxp)->srowppp)

/*
 * Declaration helpers.  MOPDECL now declares a pointer with the name it is
 * given; it previously ignored its argument and always declared "mop".
 */
#define MXPDECL(mxp)  struct matrix_dsc_s *mxp
#define MOPDECL(mop)  struct matrix_op_s *mop
#define MXOPDECL(mxp, mop)  MXPDECL(mxp); MOPDECL(mop)
//--- shortcut for declaring and init'ing some common variables in matrix ops
//    Declares i, j, m, n, rpp and rp in the current scope, then loads the row
//    pointer array and the dimensions from mxp.  rp starts out as NULL; it is
//    meant to be assigned by mxforeachitem().  No trailing semicolon: the
//    caller supplies it.
#define MXPARAMS(mxp, i, j, m, n, rpp, rp) 	\
		  unsigned int i, j, m, n;          \
				 double **rpp, *rp = NULL;  \
				 rpp = MXRPP(mxp);          \
				 m = MXROWS(mxp); n = MXCOLS(mxp)

//--- shortcut for looping thru all the elements in the matrix
//    For every row index i in [0, m) rp is set to rpp[i], and the statement
//    that follows the macro runs once for every column index j in [0, n).
//    The row pointer is fetched only after the bound check, so rpp[m] is
//    never read and an empty matrix (m == 0) does not touch rpp at all.
#define mxforeachitem(i, j, m, n, rpp, rp)                                \
	for ( (i) = 0; (i) < (m) && ( ((rp) = (rpp)[(i)]), 1 ); (i)++ )       \
		for ( (j) = 0; (j) < (n); (j)++ )
		
/* Relational helpers.  NEQ used to expand to "==" and GE to the invalid
 * token sequence "=>"; both now do what their names say. */
#define EQ(m, n)      ((m) == (n))
#define NEQ(m, n)     ((m) != (n))
#define GT(m, n)      ((m) >  (n))
#define GE(m, n)      ((m) >= (n))
#define LT(m, n)      ((m) <  (n))
#define LE(m, n)      ((m) <= (n))

/* Dimension comparisons between two matrix descriptors. */
#define MXROWS_EQ(mxp1, mxp2)        ((mxp1)->num_rows == (mxp2)->num_rows)
#define MXCOLS_EQ(mxp1, mxp2)        ((mxp1)->num_cols == (mxp2)->num_cols)
#define MXCOLS_EQ_ROWS(mxp1, mxp2)   ((mxp1)->num_cols == (mxp2)->num_rows )
#define MXROWS_EQ_COLS(mxp1, mxp2)   ((mxp1)->num_rows == (mxp2)->num_cols )
/* Inner dimensions agree: columns of mxp1 == rows of mxp2. */
#define MXINDIM_EQ(mxp1, mxp2)       (MXCOLS(mxp1) == MXROWS(mxp2) )

/* Same shape (rows and columns) for two, respectively three, matrixes. */
#define MXSIZE_EQ(mxp1, mxp2)        (MXROWS_EQ(mxp1, mxp2) && MXCOLS_EQ(mxp1, mxp2) )
#define MXSIZE_EQ3(mxp1, mxp2, mxp3) (MXSIZE_EQ(mxp1, mxp2) && MXSIZE_EQ(mxp2, mxp3) && MXSIZE_EQ(mxp3, mxp1) )

/*
 * MXSIZE(mxp): format "Mx size of <id>: <rows> x <cols>" into the file-level
 * buffer `dbg` (or "(NULL)" when mxp is a null pointer) and emit it with DBG.
 * MXSIZE2(i, mxp): same, prefixed with the number i; it has no NULL check.
 * NOTE(review): the dimensions are printed with %d although num_rows and
 * num_cols are declared unsigned int.
 */
#define MXSIZE(mxp)   do { if (mxp) sprintf(dbg, "Mx size of %-35s: %5d x %-5d", MXID(mxp), MXROWS(mxp), MXCOLS(mxp)); else strcpy(dbg, "(NULL)"); DBG(dbg);} while(0)
#define MXSIZE2(i, mxp)   do { sprintf(dbg, "%2i : Mx size of %-35s: %5d x %-5d", i, MXID(mxp), MXROWS(mxp), MXCOLS(mxp)); DBG(dbg);} while(0)

/* NOTE(review): presumably selects row vs. column handling for a
 * matrix/vector operation - the users are not visible here. */
#define MATVEC_XTYPE_ROW    0
#define MATVEC_XTYPE_COL    1

/*
 * MXNOP(mxap, mxbp): write a "dimensions do not match" message naming both
 * matrixes into `dbg`.  The macro only formats the text; it does not print
 * it, so the caller has to follow up with DBG(dbg) if output is wanted.
 */
#define MXNOP(mxap, mxbp)   do { \
    sprintf(dbg, "number of rows and cols does not match cannot operator on them = %s (%d, %d), %s(%d, %d)",\
    MXID(mxap), MXROWS(mxap), MXCOLS(mxap), MXID(mxbp), MXROWS(mxbp), MXCOLS(mxbp) ); } while(0)

/* Total number of elements (rows * columns) of a matrix descriptor. */
#define mxsize(mxp)   ( MXROWS(mxp) * MXCOLS(mxp) )

/* Typed allocation helpers: room for n objects of the given type.
 * The result is NOT checked here; callers must test it for NULL. */
#define MALLOC(n, type)   (malloc((n) * sizeof(type)))
#define CALLOC(n, type)   (calloc((n),  sizeof(type)))

/* Element count of a true array; parenthesized so that expressions such as
 * "x / ARRAYSIZE(a)" evaluate as intended. */
#define ARRAYSIZE(arr)   (sizeof(arr) / sizeof((arr)[0]))

/*
 * Small numeric predicates.  Every expansion is fully parenthesized, and
 * each macro may evaluate its argument more than once, so do not pass
 * expressions with side effects.
 *
 * isposinf/isneginf use isinf() from <math.h>; FP_INFINITE is only a
 * classification code returned by fpclassify(), not an infinite value, so
 * comparing x against it was wrong.
 * isodd/iseven test the remainder against 0 so that negative integers are
 * classified correctly (-3 % 2 is -1 in C, not 1).
 */
#define  isposinf(x)  ( isinf(x) && (x) > 0 )
#define  isneginf(x)  ( isinf(x) && (x) < 0 )
#define  iszero(x)    ( (x) == 0 )
#define  isgt0(x)     ( (x) >  0 )
#define  islt0(x)     ( (x) <  0 )
#define  isodd(x)     ( ((x) % 2) != 0 )
#define  iseven(x)    ( ((x) % 2) == 0 )

// Sign used by the PLA code: +1 for w > 0, -1 otherwise (including w == 0).
// The result is an int; an earlier variant returned 1.0 / -1.0.
#define sign_pla(w)   ( (w) > 0 ? 1 : (-1) )

// macro to compute the outer product of u * vT at (i, j), i.e. u[i] * vT[j].
// (The previous expansion used the comma operator and yielded just vT[j].)
#define outerprod(u, vT, i, j)  ( (u)[(i)] * (vT)[(j)] )

// Classic min/max; each argument may be evaluated twice.
#define max(a, b)   ((a) > (b) ? (a) : (b))
#define min(a, b)   ((a) < (b) ? (a) : (b))

// Boundary-condition selectors for cubic-spline interpolation.
#define CSPLINE_INTERP_NATURAL  0
#define CSPLINE_INTERP_CLAMPED  1

/*
 * File-level scratch buffers.  `dbg` (40 * BUF_LEN bytes) is the buffer the
 * diagnostic macros and functions in this file sprintf() into before handing
 * the text to DBG().  It is shared and unsynchronized, so its content is only
 * valid until the next message is formatted.
 */
static char dbg[40*BUF_LEN],  msg[BUF_LEN];

/* Forward declaration; the full definition follows below. */
struct matrix_dsc_s;

/*
 * State for a cubic-spline interpolation.
 * NOTE(review): bound_type presumably holds CSPLINE_INTERP_NATURAL or
 * CSPLINE_INTERP_CLAMPED, and xp/yp the node coordinates with ap..dp the
 * per-segment coefficients - the code using this struct is not visible
 * here, confirm before relying on it.
 */
struct cspline_interp_s {
    unsigned int bound_type, cnt, num_nodes, n, k;
    double *xp, *yp, *ap, *bp, *cp, *dp, *hp, x, y; 
    double fap, fbp;  // first-order deriv of A and B endpoints, respectively
    struct matrix_op_s *mop;
};


/*
 * A collection of matrixes taking part in one operation.
 *
 * `mpp` is the array of matrix descriptors (see MOP2MXP) and `num_matrixes`
 * its element count (see MOPMXNUM).  Most of the bit-fields are small
 * indices naming the role a matrix plays, as described by the per-field
 * comments.
 *
 * NOTE(review): many index fields are only 10 or 6 bits wide, so they cannot
 * represent MX_NONIDX (9999999); if that sentinel is ever stored in them it
 * will be truncated - confirm against the code that sets these fields.
 */
struct matrix_op_s {
    unsigned int num_matrixes: 16,
                           op :16;

     unsigned int          ia :16,  // A matrix
                           ib :16;  // b matrix

     unsigned int          ix :10,  // x matrix
                           it :10,  // transposed matrix
                           il :10;  // L matrix in LU decomposition

     unsigned int          iu :10,  // U matrix in LU decomposition
                           iv :10,  // inverted matrix
                           id :10;  // identity matrix

    unsigned int         idup :10,  // duplicate of A
                           iq :10,  // matrix q in QR decomposition
                     jcb_iter : 1,  // 1: Jacobi iterative; 0: Gauss-Seidel iterative
                           ir :10;  // matrix r in QR decomposition

	unsigned int          igh :10,  // graph connection matrix
                          ix2 :10, 
                          iam :10,  // used in automata test
                        new_xm: 1,  // When solving A*x =b, don't create x matrix 
                          str : 1;  // matrix contains string data

    unsigned int           iy : 6,  // y matrix
                         iadj : 6,  // the adjoint matrix of A
                      blkrows : 6,  //  no. of smaller matrixes to make up the rows of the big block
                      blkcols : 6,  //  no. of smaller matrixes to make up the columns of the  big block
                           xx : 1; 
    struct matrix_dsc_s **mpp;      // array of matrix descriptors, indexed via MOP2MXP()
	unsigned int offset, num_blkrows, num_blkcols;
};

/*
 * Descriptor of a single matrix.
 *
 * `id` is the matrix name (MXID), `num_rows` x `num_cols` its dimensions
 * (MXROWS / MXCOLS), and `rowp` an array of row pointers to the double data
 * (MXRPP).  `srowppp` is the string-data counterpart (MXRPPSTR).
 */
struct matrix_dsc_s {
    char *id;
    unsigned int num_rows, num_cols;
    unsigned int type:16,
                 aug : 1,
              not_li : 1,
			   no_rp : 1, // the rp storage is actually in some other matrix
			     chr : 1, // for character diagram
                 xx  : 1; // What kind of data in this matrix
    unsigned int m, *mp, n, *np;         // Added for matrix slicing operation
    double **rowp, det;
    char ***srowppp;
    void ***rowppp;  // General application
    // ------------------------------------------------------------------
    //  These fields are added to support the polynomials in the matrix.
    // ------------------------------------------------------------------
    struct polynomial_dsc_s ***mpnppp,  // For representing the matrix which contains the polynomial elements
                        *det_pnp;
    struct polynomial_op_s *pop;  // For collecting intermediate and final results
};


/*
 * Two unsigned values plus a byte pointer.
 * NOTE(review): presumably (x, y) is a position or size and `sp` the
 * associated raw data; no user of this struct is visible here - confirm.
 */
struct matrix_data_s {
    unsigned int x, y;
    unsigned char *sp;
};


/*
 * Working state for the permutation/combination utilities.
 * NOTE(review): `perm` / `comb` presumably select the mode and `gcpp` holds
 * the generated rows (num_rows x num_cols) - the code filling this struct is
 * not visible here, confirm.
 */
struct util_permcomb_s {
    unsigned int perm: 1,
                 comb: 1,
                 x1  : 1,
                 xx  : 1;
    unsigned int m, n, num_rows, num_cols;
    unsigned int **gcpp;
};



/*   
 *   Command line options and user-supplied data
 *
 */
/*
 * Result of command line parsing; filled in by na_parse_params().
 *
 * The one-bit fields mirror the long/short options of the same name and
 * select which routine(s) to run.  `dfname` and `lenfname` are heap strings
 * (created with strdup() by the parser and free()d by main()); `rp` points
 * straight into argv and must not be freed.
 * Width limits worth knowing: `rt` holds 6 bits and `pdf_fs` 5 bits, so
 * larger option values are truncated on assignment.
 */
struct na_params_s {
    char *dfname, *lenfname, *rp;
    unsigned int height, width, value, signo, head, tail;
    unsigned int 
        am :    1,   bs :     1,   cg :    1,   gd  :   1,
        cmplex: 1,   calc :   1,   dsp:    1,   dts :   1, 
        de :    1,   de_na :  1,   de_eu:  1,   de_eu2: 1,
        de_plot:1,   ecg:     1,   ee :    1,   ee2 :   1,
        fxp:    1,   ft  :    1,   fft:    1,   cmp :   1,
        itp_lg: 1,   itp_spl: 1,   itp_hmt:1,   info:   1,
        mos :   1,   mc :     1,   ml :    1,   mx  :   1,
        lc :    1,   lp :     1,   poly:   1,   vec :   1;
    
    unsigned int 
        newton: 1,   graph  : 1,   heap :  1,   lsqf :  1,
        pow_ee: 1,   secant : 1,   ni :    1,   pb   :  1,
        test  : 1,   svm    : 1,   st :    1,   tm   :  1,
        ntu   : 1,   o      : 1,   cmp2 :  1,   inv  :  1,
		rt    : 6,   xx     : 1,     x1 :  1;
    unsigned int     
        gu:     1,   gu2:     1,   spug:   1,   scale:  1,
        spu:    1,   spt:     1,   li:     1,   prml :  1,
        pdf:    1,   spl:     1,   mst:    1,    spd :  1, 
        len:    1, // 1 if the length of SMT is sent to a file, specified by lenfname
        pdf_fs: 5, // font size for PDF diagram file
        coord:  1,
        dense:  1,
        dummy:  1;
	
	struct matrix_dsc_s *mxp;      // user-specific data matrix
	double dvc, eps, delta, N, C;  // for SVM C constant
};

/* Random-number helpers; defined outside the part of the file shown here. */
double ran1(void);
double gen_sd_num(void);
double rand01(void);

/* Command line parsing (defined below) and the utilities entry point. */
struct na_params_s *na_parse_params(int argc, char **argv);
void run_utils(void);

/*
 * Forward declarations of matrix operations.  The matrix_op_* routines take
 * the whole operation context (struct matrix_op_s); the others work on
 * individual matrix descriptors.  Definitions are not in this part of the
 * file.
 */
void matrix_op_sub(struct matrix_op_s *mop);
void matrix_op_add(struct matrix_op_s *mop);
void matrix_op_gje(struct matrix_op_s *mop);

double matrix_det_rec(struct matrix_dsc_s *mxp, unsigned int xrn, unsigned int xcn, unsigned int *rnp, unsigned int *cnp, unsigned int rank);
void matrix_op_det(struct matrix_op_s *mop);
void matrix_op_adj(struct matrix_op_s *mop);
void matrix_op_jcb(struct matrix_op_s *mop);
void matrix_op_aug(struct matrix_op_s *mop);
void matrix_nsign(struct matrix_dsc_s *mxp);
void matrix_cofactor_ij(struct matrix_dsc_s *ijmxp, struct matrix_dsc_s *mxp, unsigned int i, unsigned int j);
unsigned int matrix_same_addsub(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp);
unsigned int matrix_equal(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp);
unsigned int matrix_rc_index_valid(struct matrix_dsc_s *mxp, unsigned int ri, unsigned int ci);
void matrix_ewop_mul(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp, double alpha);
double matrix_ewop_sqrtsum(struct matrix_dsc_s *mxp);

/* Forward declarations: routines for matrixes with polynomial elements
 * (matrix_pn_*).  Definitions are not in this part of the file. */
void matrix_pn_init(struct matrix_dsc_s *mxp);
void matrix_pn_fini(struct matrix_dsc_s *mxp);
void matrix_pn_set_enval(struct matrix_dsc_s *mxp);
void matrix_pn_det(struct matrix_dsc_s *mxp);
void matrix_pn_print(struct matrix_dsc_s *mxp);

/* Forward declarations: string matrixes, slicing and row/column predicates. */
void matrix_print_str(struct matrix_dsc_s *mxp);
struct matrix_dsc_s *matrix_init_str(char *datafilep);
struct matrix_dsc_s *matrix_slice(struct matrix_dsc_s *mxp);
void matrix_slice_params(struct matrix_dsc_s *mxp, unsigned int num_rows, unsigned int num_cols);
struct matrix_dsc_s *matrix_slice_mn(struct matrix_dsc_s *mxp, 
  unsigned int m, unsigned int *mp, unsigned int n, unsigned int *np);
struct matrix_dsc_s *matrix_circular_create(unsigned int n, double *vp);
unsigned int matrix_is_row(struct matrix_dsc_s *mxp);
unsigned int matrix_is_column(struct matrix_dsc_s *mxp);
void matrix_slice_set_params(struct matrix_dsc_s *mxp, unsigned int *rdp, unsigned int *cdp);

/* Forward declarations: matrix construction from existing data or rows.
 * Definitions are not in this part of the file. */
void matrix_fill_vs(struct matrix_dsc_s *inmxp, struct matrix_dsc_s *vsmxp, unsigned int row_lo, unsigned int row_hi);
struct matrix_dsc_s *matrix_create_vs(struct matrix_dsc_s *inmxp, unsigned int row_lo, unsigned int row_hi);
void matrix_sw_rows(struct matrix_dsc_s *mxp, unsigned int *rip, unsigned int num_ris);
struct matrix_dsc_s *matrix_create_rowp_only(char *id, unsigned int num_rows, unsigned int num_cols);
struct matrix_dsc_s *matrix_create_rowp(struct matrix_op_s *mop, char *id, double *p, double **newp, unsigned int num_rows, unsigned int num_cols);
unsigned int matrix_arrays(struct matrix_op_s *mop, double *dp, unsigned int a[], unsigned int num_mxps, unsigned int g_flag);

/* Forward declarations: loading matrix data from text files / text lines. */
void  matrix_load_file(struct matrix_dsc_s *mxp, char *datafilep);
void  matrix_load_one_row(unsigned int n, double *rp, char *dp);
double *matrix_one_rp(unsigned int n, char *dp);
unsigned int matrix_num_cols_per_row(char *linebufp);

//inline void matrix_set_no_rp(struct matrix_dsc_s *mxp);
void matrix_set_no_rp(struct matrix_dsc_s *mxp);
struct matrix_dsc_s *matrix_reshape(struct matrix_dsc_s *inmxp, unsigned int a, unsigned int b);
/* nn_mul_*: matrix products with three matrix operands; the aT / bT in the
 * name presumably marks a transposed operand - confirm at the definitions. */
void nn_mul_abc(struct matrix_dsc_s *amxp, struct matrix_dsc_s *bmxp, struct matrix_dsc_s *cmxp);
void nn_mul_aTbc(struct matrix_dsc_s *amxp, struct matrix_dsc_s *bmxp, struct matrix_dsc_s *cmxp);
void nn_mul_abTc(struct matrix_dsc_s *amxp, struct matrix_dsc_s *bmxp, struct matrix_dsc_s *cmxp);
int matrix_line_2ints(char *linebufp);
unsigned  int *ip_per_row(char *dp);
double  *rp_per_row(char *dp);

/* Forward declarations: basis checks on a matrix.  Definitions are not in
 * this part of the file. */
unsigned int matrix_basis_normal(struct matrix_dsc_s *mxp);
unsigned int matrix_basis_orthogonal(struct matrix_dsc_s *mxp);
unsigned int matrix_basis_orthonormal(struct matrix_dsc_s *mxp);

/* Look up a matrix inside an operation context by its id string. */
unsigned int mop2mxid_by_str(struct matrix_op_s *mop, unsigned char *id);
struct matrix_dsc_s *mop2mxp_by_str(struct matrix_op_s *mop, unsigned char *id);

/* Integer-vector printing and permutation/combination utilities. */
void vector_print_int(unsigned int n, int *vp);
void vector_sprint_int(char *msg, unsigned int n, int *vp);
struct matrix_dsc_s *util_comb_matrix(unsigned int m, unsigned int n);
struct matrix_dsc_s *util_perm_matrix(unsigned int n);
/** compute the number of combinations C(m, n), n out m **/
unsigned int util_comb_num(unsigned int m, unsigned int n);
unsigned int util_perm_num(unsigned int n);


/* File utilities; nau_file_size, nau_binary_file and nau_num_lines are
 * defined further down in this file. */
size_t nau_file_size(const char *fpath);
unsigned int nau_binary_file(char *datafilep);
unsigned int nau_rand_integer(unsigned int n);

/* Forward declarations; definitions are not in this part of the file. */
struct matrix_op_s *matrix_op_init2(const unsigned char *);
double matrix_chodecomp_dotprod(struct matrix_dsc_s *lmxp, double *dp, unsigned int dn);
double matrix_chodecomp_offdiagprod(struct matrix_dsc_s *lmxp, double *dp, unsigned int rn, unsigned int cn);
void  matrix_eye2diag(struct matrix_dsc_s *mxp, double *dp);

/* Top-level routines selected by the parsed command line (main() calls
 * run_nn_pred for the `ml` flag and run_mfcc for the `fft` flag). */
void run_nn(struct na_params_s *p);
void run_nn_pred(struct na_params_s *p);
void run_mfcc(struct na_params_s *p);
void  dstdct(unsigned int m, double *xp, double *yp, unsigned int dctflag, unsigned int type);
void  freq_spectrum(unsigned int N, double *xrealp, double *ximagp, double *psp);
char *nau_reverse_id(char *p, char c);
unsigned int nau_num_lines(char *datafilep);


/*
 * Utility function: report the size of a file in bytes.
 *
 * fpath: path handed to stat().
 * Returns the st_size reported by stat(), or 0 when stat() fails (so a
 * missing file and an empty file are indistinguishable to the caller).
 */
size_t nau_file_size(const char *fpath)
{
    struct stat info;

    if ( stat(fpath, &info) < 0 )
        return 0;

    return info.st_size;
}


/**======= Utility: check if a data file is binary or ascii  =======**/
/*
 * Only the first 200 bytes (at most) of the file are inspected.
 *
 * Returns 0 when every inspected byte passes isascii() (this includes an
 * empty file) and 1 as soon as at least one byte does not.
 * If the file cannot be opened, a diagnostic is printed and the process
 * exits with status 1.
 */
unsigned int nau_binary_file(char *datafilep)
{
	char chunk[200];
	unsigned int got, k, ascii_cnt = 0;
	FILE *in = fopen(datafilep, "r");

	if ( !in ) {
		sprintf(dbg, "cannot open file %s: errno = %d ",
			datafilep, errno);
		DBG(dbg);
		exit(1);
	}

	// Count the ASCII bytes among those actually read; a single non-ASCII
	// byte makes the counts differ and marks the file as binary.
	got = fread(chunk, sizeof(char), sizeof(chunk)/sizeof(chunk[0]), in);
	for ( k = 0; k < got; k++ )
		ascii_cnt += isascii(chunk[k]) ? 1 : 0;

	fclose(in);

	return ascii_cnt != got;
}

/**======= Utility: counting the number of lines in a file  =======**/
/*
 * datafilep: path of the file to scan.
 *
 * Returns the number of lines: every '\n' ends a line, and trailing data
 * without a final newline counts as one more line.  Returns 0 (after
 * printing a diagnostic) when the file cannot be opened.
 *
 * The file is scanned in small chunks, so the result does not depend on any
 * maximum line length and no multi-megabyte buffer is placed on the stack.
 * The stream is closed before returning (it used to be leaked).
 */
unsigned int nau_num_lines(char *datafilep)
{
	FILE *fp;
	unsigned int nl = 0;
	char buf[4096];
	char last = '\n';   // last byte seen; '\n' means "no partial line pending"
	size_t got, k;

	fp = fopen(datafilep, "r");
	if ( !fp ) {
		snprintf(dbg, sizeof(dbg), "could not open data file  \"%s\". Please check.", 
			datafilep);
		DBG(dbg); 
		return 0;
	}

	while ( (got = fread(buf, 1, sizeof(buf), fp)) > 0 ) {
		for ( k = 0; k < got; k++ ) {
			if ( buf[k] == '\n' )
				nl++;
		}
		last = buf[got - 1];
	}

	// A non-empty final line that lacks the terminating newline still counts.
	if ( last != '\n' )
		nl++;

	fclose(fp);

	return nl;
}


/** ----------------- Greatest common divisor ------------- **/
/*
 * Euclid's algorithm, iterative.
 *
 * Returns the greatest common divisor of |a| and |b|; the result is never
 * negative (it used to depend on the sign and order of the arguments).
 * gcd(x, 0) == gcd(0, x) == |x| and gcd(0, 0) == 0.
 * The magnitudes are computed in unsigned arithmetic so that INT_MIN does
 * not overflow; only when the mathematical result is 2^31 (both arguments
 * drawn from {0, INT_MIN}) is the returned int not representable as a
 * positive value.
 */
int gcd(int a, int b)
{
	unsigned int ua = ( a < 0 ) ? 0u - (unsigned int) a : (unsigned int) a;
	unsigned int ub = ( b < 0 ) ? 0u - (unsigned int) b : (unsigned int) b;
	unsigned int rem;

	while ( ua != 0 ) {
		rem = ub % ua;
		ub  = ua;
		ua  = rem;
	}

	return (int) ub;
}


/** -------------------- main() -------------------------- **/

/*
 * Program entry point: parse the command line, run the prediction routine
 * when the `ml` flag is set and the MFCC routine when the `fft` flag is set,
 * then release the parameter block and the two strings it owns.
 * (run_nn() is the alternative to run_nn_pred(); it is currently not called.)
 */
int main(int argc, char **argv)
{
    struct na_params_s *params = na_parse_params(argc, argv);

    if ( params->ml )
        run_nn_pred(params);

    if ( params->fft )
        run_mfcc(params);

    // free(NULL) is a no-op, so the strings need no NULL guard.
    free(params->dfname);
    free(params->lenfname);
    free(params);

    return 0;
}

/*
 *   All the command line arguments are parsed here.
 *   Based on the parsed results, various routines will
 *   be invoked.
 *
 *   Returns a heap-allocated, zero-initialised struct na_params_s with the
 *   recognised options filled in.  The caller owns it and must free()
 *   p->dfname, p->lenfname and the struct itself; p->rp points into argv and
 *   must not be freed.  If the struct cannot be allocated a diagnostic is
 *   printed and the process exits with status 1.
 *
 *   getopt_long() can only store through an int pointer, so every flag is
 *   first collected in a local int and copied into the bit-fields at the end.
 *
 *   Fixes relative to the earlier version:
 *     - "-d" duplicated the file name with strcpy() into a NULL pointer; it
 *       now uses strdup() exactly like "--df" (and neither leaks when the
 *       option is repeated);
 *     - dvc / eps were copied into the result without ever being initialised
 *       when -V / -e were absent;
 *     - flags that were parsed but never copied into the result (calc,
 *       cmplex, dsp, dts, fxp, ft, graph, lc, lsqf, poly, mc, mos, itp_*,
 *       ni, newton, pow_ee, secant) are now stored;
 *     - "-t" / "-l" and "--pdf_fs" / "--len" had handlers but were missing
 *       from the option string / table, so the handlers were unreachable;
 *     - unknown options fall into an explicit default case.
 */

struct na_params_s *na_parse_params(int argc, char **argv)
{
    struct na_params_s *p = CALLOC(1, struct na_params_s);
	double dvc = 0.0, eps = 0.0;
	int r, option_index = 0,        df = 0,
      am    = 0,    bs  = 0,      calc = 0,  cmplex = 0,
      cg    = 0,    gd  = 0,      de   = 0,   de_na = 0,
      de_eu = 0, de_eu2 = 0,   de_plot = 0,    dsp  = 0,
      dts   = 0,    ecg = 0,      ee   = 0,    ee2  = 0,       
      fxp   = 0,    ft  = 0,      fft  = 0,   graph = 0,
      lc    = 0,    lp  = 0,      lsqf = 0,    poly = 0,    
      mc    = 0,    ml  = 0,      mx   = 0,    mos  = 0,
      itp_lg= 0, itp_spl= 0,   itp_hmt = 0,      li = 0,
      ni    = 0, newton = 0,    pow_ee = 0,     pb  = 0,
      scale = 0,    vec = 0,      st   = 0,  secant = 0,  
      gu    = 0,    gu2 = 0,     dense = 0,     tm  = 0,
      test  = 0,    cmp = 0,      cmp2 = 0,    info = 0,
      spd   = 0,    spu = 0,      spl  = 0,     spt = 0,  
      spug  = 0,    svm = 0,      ntu  = 0,   value = 0,
	  mst   = 0,    pdf = 0,    pdf_fs = 0,   signo = 0,
	  head  = 0,   tail = 0,    inv    = 0,      rt = 0,
      prml  = 0,  coord = 0,     width = 0,  height = 0; 
	

    // 't' and 'l' are listed so that their case labels below are reachable.
    char optstr[] = "d:ugpmostlz:h:w:cC:v:N:e:V:D:r:";
    struct option los[] = {
        {"am",       0, &am,      1},  // automata  
        {"bs",       0, &bs,      1},  // bisection algorithm
        {"df",       1, &df,      1},  // data file name 
        {"calc",     0, &calc,    1},  // calculus
        {"cmplex",   0, &cmplex,  1},  // complex number/analysis
        {"cg",       0, &cg,      1},  // conjugate gradient descent
        {"gd",       0, &gd,      1},  // gradient descent
        {"de",       0, &de,      1},  // differential equations
        {"de_eu",    0, &de_eu,   1},  // Euler's method
        {"de_eu2",   0, &de_eu2,  1},  // modified Euler's method
        {"de_na",    0, &de_na,   1},  // DE numerical method
        {"de_plot",  0, &de_plot, 1},  // DE plot field and solution curves
        {"dsp",      0, &dsp,     1},  // digital signal processing
        {"dts",      0, &dts,     1},  // discrete-time signal processing
        {"ecg" ,     0, &ecg,     1},  // ECG ml project
        {"rt" ,      1, &rt,      1},  // ECG run type
        {"head" ,    1, &head,    0},  // specify a range [head,
        {"tail" ,    1, &tail,    0},  // ...              tail)
        {"li" ,      0, &li,      1},  // ECG line integration
        {"cmp" ,     0, &cmp,     1},  // ECG comparison with std anno
        {"cmp2" ,    0, &cmp2,    1},  // ECG comparison with std boundary anno
        {"inv" ,     0, &inv,     1},  // ECG signal inverted 
        {"ee" ,      0, &ee,      1},  // electronic circuit
        {"ee2" ,     0, &ee2,     1},  
        {"fxp" ,     0, &fxp,     1},  // fixed point algorithm
        {"info" ,    1, &info,    1},  // print related information
        {"ft" ,      0, &ft,      1},  // Fourier transform
        {"fft" ,     0, &fft,     1},  // FFT 
        {"graph" ,   0, &graph,   1},  // graph routines
        {"lc" ,      0, &lc,      1},  // linear circuit
        {"lp" ,      0, &lp,      1},  // linear programming
        {"lsqf" ,    0, &lsqf,    1},
        {"pb",       0, &pb,      1},  // probability
        {"poly",     0, &poly,    1},  // polynomial operations
        {"pow_ee",   0, &pow_ee,  1},  // power electronics
        {"mc" ,      0, &mc,      1},  // multivariable calculus
        {"ml" ,      0, &ml,      1},  // machine learning algorithms
        {"mx" ,      0, &mx,      1},  // matrix routines
        {"mos" ,     0, &mos,     1},  // MOSFET simulation or computation
        {"itp_lg" ,  0, &itp_lg,  1},  // interpolation, Lagrange
        {"itp_spl",  0, &itp_spl, 1},  // interpolation, cubic spline
        {"itp_hmt",  0, &itp_hmt, 1},  // interpolation, Hermite
        {"ni" ,      0, &ni,      1},  // numerical integration
        {"newton" ,  0, &newton,  1},  // Newton-Raphson method
        {"scale" ,   0, &scale,   1},
        {"vec" ,     0, &vec,     1},  // vector routines
        {"st" ,      0, &st,      1},  // statistics inference
        {"svm" ,     0, &svm,     1},  // ML SVM classification
        {"ntu" ,     0, &ntu,     1},  // NTU ML 1 and 2
        {"prml" ,    0, &prml,    1},  // ETH Automatic Mobile Robot
        {"secant" ,  0, &secant,  1},  // secant algorithm
        {"tm" ,      0, &tm,      1},  // Turing machine
        {"test" ,    0, &test,    1},  // algorithm test
        {"signo",    1, &signo,   0},  // signal no. in ECG 
        {"spd" ,     0, &spd,     1},  // (reserved)
        // The next two have a NULL flag and val 0, so getopt_long() returns
        // 0 and they are handled by name in "case 0" below.
        {"pdf_fs",   1,  0,       0},  // font size for the PDF diagram
        {"len",      1,  0,       0},  // file receiving the length output
        {NULL,       0,  0,       0}
    };

    if ( p == NULL ) {
        DBG("cannot allocate the command line parameter block");
        exit(1);
    }

    while( 1 ) {
        r = getopt_long(argc, argv, optstr, los, &option_index);
        if ( r == -1 ) break;

        switch( r ) {
        case 0:
            // Long option.  Plain flags were already stored through their
            // flag pointer; only options carrying an argument need work.
            if ( optarg ) {
                if ( !strcmp(los[option_index].name, "df") ) { 
                    free(p->dfname);               // option may be repeated
                    p->dfname = strdup(optarg);    
                } else if ( !strcmp(los[option_index].name, "pdf_fs") ) {
                    pdf_fs = atoi(optarg);
                } else if ( !strcmp(los[option_index].name, "len") ) {
                    free(p->lenfname);
                    p->lenfname = strdup(optarg);
                    p->len = 1;
                
                } else if ( !strcmp(los[option_index].name, "signo") ) {
                    signo = atoi(optarg);
                } else if ( !strcmp(los[option_index].name, "info") ) {
					p->info = 1;
                    p->rp = optarg;
                } else if ( !strcmp(los[option_index].name, "head") ) {
					head = atoi(optarg);
                } else if ( !strcmp(los[option_index].name, "tail") ) {
					tail = atoi(optarg);
                } else if ( !strcmp(los[option_index].name, "rt") ) {
					rt  = atoi(optarg);
				}
            }
        break;
        
        case 'c': coord  = 1;                      break;
        case 'd':
            // p->dfname starts out NULL, so the name must be duplicated,
            // not copied into it.
            free(p->dfname);
            p->dfname = strdup(optarg);
            break;
        case 'e': eps = strtod(optarg, NULL);      break;
        case 'u': spu    = 1;                      break;
        case 't': spt    = 1;                      break;
        case 'l': spl    = 1;                      break;
        case 'g': spug   = 1;                      break;
        case 'm': mst    = 1;                      break;
        case 'p': pdf    = 1;                      break;
        case 'z': pdf_fs = atoi(optarg);           break;
        case 'v': value  = atoi(optarg);           break;
        case 'V': dvc    = strtod(optarg, NULL);   break;
        case 'h': height = atoi(optarg);           break;
        case 'w': width  = atoi(optarg);           break;
        case 's': scale  = 1;                      break;
        case 'o': p->o   = 1;                      break;
        case 'r': p->rp  = optarg;                 break;
        case 'C': p->C   = strtod(optarg, NULL);   break;
        case 'N': p->N   = strtod(optarg,  NULL);  break;
        case 'D': p->delta = strtod(optarg, NULL); break;
        default:
            // '?' / ':' - getopt_long() has already printed a message.
            break;
        }
    
    }

    // Copy the collected values into the (bit-field) members of the result.
    p->spu  = spu;     p->scale = scale;   p->mst = mst;  p->pdf = pdf;
    p->spug = spug;    p->coord = coord;   p->spt = spt;  p->spl = spl;
	p->gu2  = gu2;     p->gu  = gu;        p->spd = spd;
	p->signo = signo;  p->dense = dense;   p->pdf_fs = pdf_fs;  
	p->width= width;   p->height = height; 

    p->am = am;   p->bs = bs;        p->cg = cg;    p->gd = gd;
    p->de = de;   p->de_eu = de_eu;  p->de_eu2 = de_eu2; 
    p->pb = pb;   p->de_na = de_na;  p->ecg    = ecg;
	p->ee = ee;   p->ee2 = ee2;      p->svm    = svm;
    p->vec= vec;  p->test = test;    p->ntu    = ntu;
    p->lp = lp;   p->mx = mx;        p->ml     = ml;  
	p->st = st;   p->tm = tm;        p->value  = value;
	p->dvc= dvc;  p->eps = eps;
	p->li = li;   p->cmp = cmp;   	 p->info   = info;   
	p->head = head;   p->fft = fft;  p->cmp2   = cmp2; 
	p->tail = tail;   p->rt = rt;    p->inv    = inv;
	p->de_plot = de_plot;            p->prml   = prml;

    // Flags that used to be parsed and then silently dropped.
    p->calc   = calc;    p->cmplex  = cmplex;   p->dsp     = dsp;
    p->dts    = dts;     p->fxp     = fxp;      p->ft      = ft;
    p->graph  = graph;   p->lc      = lc;       p->lsqf    = lsqf;
    p->poly   = poly;    p->mc      = mc;       p->mos     = mos;
    p->itp_lg = itp_lg;  p->itp_spl = itp_spl;  p->itp_hmt = itp_hmt;
    p->ni     = ni;      p->newton  = newton;   p->pow_ee  = pow_ee;
    p->secant = secant;

	return p;
}


/** ====== Utility for solving a quadratic equation begins ======= **/

/** utility functions **/
/*
 * Duplicate `size` bytes starting at srcp into freshly malloc()ed memory.
 *
 * Returns the copy, which the caller must free(), or NULL when srcp is NULL
 * or the allocation fails (the copy used to be attempted regardless, writing
 * through a NULL pointer on allocation failure).
 */
void *nau_memdup(void *srcp, size_t size)
{
    void *dstp;

    if ( srcp == NULL )
        return NULL;

    dstp = malloc(size);
    if ( dstp != NULL )
        memcpy(dstp, srcp, size);

    return dstp;
}



/** --- Tell if a path is a directory: return non-zero if yes, otherwise 0 ----- **/
/*
 * path: file system path handed to stat().
 * The result is whatever S_ISDIR() yields for the entry's mode.  When stat()
 * itself fails, a diagnostic is printed and 0 is returned.
 */
int nau_file_isdir(const char *path)
{
	struct stat st;

	if ( stat(path, &st) != 0 ) {
		sprintf(dbg, "stat(\"%s\") failed.", path);
		DBG(dbg);
		return 0;
	}

	return S_ISDIR(st.st_mode);
}



/*** matrix routines ***/

/*
 * Operation codes for matrix operations.
 * NOTE(review): presumably stored in the `op` bit-field of
 * struct matrix_op_s - the code that sets/reads it is not visible here.
 */
#define MATRIX_OP_ADD  1
#define MATRIX_OP_SUB  2
#define MATRIX_OP_MUL  3
#define MATRIX_OP_CMP  4
#define MATRIX_OP_GJE  5    // Gaussian-Jordan Elimination
#define MATRIX_OP_CID  6    // Create an identity matrix
#define MATRIX_OP_KID  7    // Check if the A is an identity matrix
#define MATRIX_OP_INV  8
#define MATRIX_OP_DUP  9
#define MATRIX_OP_DPD  10   // Dot product
#define MATRIX_OP_XPD  11   // Cross product
#define MATRIX_OP_DIST 12   // Distance from a point to a plane

/* Selector for matrix_mul_aa(): going by the names, A*A^T vs. A^T*A
 * (confirm at the definition). */
#define MXMUL_TYPE_AAT  1
#define MXMUL_TYPE_ATA  2

/*
 * Forward declarations: creation, resizing and tear-down of matrixes and of
 * the operation context.  Definitions are not in this part of the file.
 */
struct matrix_dsc_s *matrix_init(char *datafilep);
struct matrix_dsc_s *matrix_init2(char *datafilep);
struct matrix_dsc_s *matrix_create(char *idp, unsigned int rows, unsigned int cols);
struct matrix_dsc_s *matrix_create2(struct matrix_op_s *mop, char *idp, unsigned int rows, unsigned int cols);
struct matrix_dsc_s *matrix_create_random(struct matrix_op_s *mop,
	unsigned int num_rows, unsigned int num_cols, double lb, double ub);

struct matrix_dsc_s *matrix_create_str(char *idp, unsigned int rows, unsigned int cols);
double *matrix_inc_row(struct matrix_dsc_s *mxp);
char **matrix_inc_row_str(struct matrix_dsc_s *mxp);
void  matrix_dec_row(struct matrix_dsc_s *mxp);
void  matrix_inc_col(struct matrix_dsc_s *mxp);
void  matrix_resize(struct matrix_dsc_s *mxp, unsigned int rn, unsigned int cn);
void  matrix_dsc_fini(struct matrix_dsc_s *mxp);
void  matrix_op_fini(struct matrix_op_s *mop);
void  matrix_op_list(struct matrix_op_s *mop);
void  matrix_op_print(struct matrix_op_s *mop);

/* Block-matrix helpers. */
void  matrix_op_makeblock(struct matrix_op_s *mop, unsigned int rows, unsigned int cols);
struct matrix_dsc_s *matrix_op_blksanity(struct matrix_op_s *mop, unsigned int rows, unsigned int cols);
 struct matrix_dsc_s *matrix_merge(struct matrix_dsc_s **mpp, unsigned int rows, unsigned cols);
struct matrix_dsc_s *matrix_blksanity(struct matrix_dsc_s **mpp, unsigned rows, unsigned int cols);

/*
 * Forward declarations: printing, file I/O, duplication, solving and
 * decomposition routines.  Definitions are not in this part of the file.
 */
void  pr_center(char *id, int k, int width);
void  pr_char_center(char *id, int k, int width);
void  matrix_print(struct matrix_dsc_s *mxp);
void  matrix_printm(struct matrix_dsc_s *mxp, unsigned int ri_lo, unsigned int ri_hi);
struct matrix_dsc_s *matrix_diagram_create(char *id, unsigned int m, unsigned int n);
void  matrix_diagram_print(struct matrix_dsc_s *mxp);
void  matrix_printn(struct matrix_dsc_s *mxp, unsigned int ci_lo, unsigned int ci_hi);
void  matrix_write_file(char *fname, struct matrix_dsc_s *mxp);
void  matrix_save_binfile(char *datafilep, struct matrix_dsc_s *mxp);
void  matrix_load_binfile(char *datafilep, struct matrix_dsc_s *mxp);
struct matrix_dsc_s *matrix_init_bin(struct matrix_op_s *mop, char *datafilep);
void  matrix_op_mul(struct matrix_op_s *mop);
void  matrix_op_bt(struct matrix_op_s *mop);
void  matrix_op_dup(struct matrix_op_s *mop);
struct matrix_dsc_s *matrix_dup(struct matrix_dsc_s *mxap);
struct matrix_dsc_s *matrix_dup2(unsigned char *newid, struct matrix_dsc_s *mxap);
struct matrix_dsc_s *matrix_dup_dims(unsigned char *newid, struct matrix_dsc_s *mxap);
double matrix_bt_get_rs(struct matrix_op_s *mop, unsigned int idx);
void  matrix_solver(struct matrix_op_s *mop);
void  matrix_solver2(struct matrix_op_s *mop);
void  matrix_gen_sol(struct matrix_op_s *mop);
unsigned int matrix_op_cmp(struct matrix_op_s *mop);
void  matrix_print_sol(struct matrix_op_s *mop);
void  matrix_op_xpose(struct matrix_op_s *mop);
void  matrix_xpose(struct matrix_dsc_s *mxp);
void  matrix_op_inv(struct matrix_op_s *mop);
void  matrix_op_lud(struct matrix_op_s *mop);
void  matrix_op_dgd(struct matrix_op_s *mop);
void  matrix_op_det(struct matrix_op_s *mop);
void  matrix_op_qr(struct matrix_op_s *mop);
void  matrix_op_cid(struct matrix_op_s *mop, unsigned int n);
void  matrix_set_id(struct matrix_dsc_s *mxp);
void  matrix_update_idstring(struct matrix_dsc_s *mxp, char *idstrp);
void  matrix_rows_sum(struct matrix_dsc_s *mxp, struct matrix_dsc_s *smxp);
void  matrix_rows_avg(struct matrix_dsc_s *mxp, struct matrix_dsc_s *amxp);
struct matrix_dsc_s *matrix_delta(struct matrix_dsc_s *mxp, struct matrix_dsc_s *aamxp);
struct matrix_dsc_s *matrix_mul_aa(struct matrix_dsc_s *amxp, unsigned int type);
struct matrix_dsc_s *matrix_covar(struct matrix_dsc_s *mxp);
struct matrix_dsc_s *matrix_covar_wrt_amxp(struct matrix_dsc_s *mxp, struct matrix_dsc_s *amxp);
void  matrix_scale(struct matrix_dsc_s *mxp, double beta);

/*
 * Forward declarations: element access, membership of a matrix in an
 * operation context, and the family of matrix products.  Definitions are not
 * in this part of the file.
 * NOTE(review): in the matrix_mul_* names a "T" presumably marks a
 * transposed operand and "aug"/"deaug" an augmented/de-augmented one -
 * confirm at the definitions.
 */
//inline double matrix_get_value(struct matrix_dsc_s *mxp, unsigned int i, unsigned int j);
double matrix_get_value(struct matrix_dsc_s *mxp, unsigned int i, unsigned int j);
//inline void matrix_set_value(struct matrix_dsc_s *mxp, unsigned int i, unsigned int j, double val);
void matrix_set_value(struct matrix_dsc_s *mxp, unsigned int i, unsigned int j, double val);
void  matrix_set_rowval(struct matrix_dsc_s *mxp, unsigned int ri, double val);
void  matrix_set_colval(struct matrix_dsc_s *mxp, unsigned int ci, double val);

unsigned int get_num_matrixes(struct matrix_op_s *mop);
unsigned int matrix_add_mxp(struct matrix_op_s *mop, struct matrix_dsc_s *mxp);
unsigned int matrix_del_mxp(struct matrix_op_s *mop, struct matrix_dsc_s *mxp);
unsigned int matrix_del_mxp_str(struct matrix_op_s *mop, char *mxidp);
void matrix_copy_ab(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp );
void  matrix_op_sort_A(struct matrix_op_s *mop);
void  matrix_compute_inv(struct matrix_dsc_s *mxp, struct matrix_dsc_s *invmxp);
void  matrix_adj_a2b(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp );
void  matrix_mul_ab(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp );
void  matrix_mul_abb(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp );
void  matrix_mul_aTbb(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp );
void  matrix_mul_aTba(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp );
struct matrix_dsc_s *matrix_mul_ab2(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp);
void  matrix_mul_abc(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp,  struct matrix_dsc_s *mxcp);
void  matrix_mul_ab_augc(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp,  struct matrix_dsc_s *mxcp);
void  matrix_mul_aTbc(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp,  struct matrix_dsc_s *mxcp);
void  matrix_mul_adeaugT_bc(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp,  struct matrix_dsc_s *mxcp);
void matrix_mul_adeaug_bc(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp,  struct matrix_dsc_s *mxcp);
void  matrix_mul_abTc(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp,  struct matrix_dsc_s *mxcp);
void  matrix_mul_abT_augc(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp,  struct matrix_dsc_s *mxcp);
void  matrix_mul_aT_deaugb_c(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp,  struct matrix_dsc_s *mxcp);
double  matrix_mul_vT_A_u(double *vp, struct matrix_dsc_s *mxp, double *up);
double  matrix_mul_vT_A_u2(double *vp, struct matrix_dsc_s *mxp, double *up, double *tp);
void matrix_add_A_c_vuT(struct matrix_dsc_s *Hmxp, struct matrix_dsc_s *Amxp, double c, double *vp, double *up);
void matrix_H1_sub_c_AvuT_B(struct matrix_dsc_s *H1mxp, double c,
 struct matrix_dsc_s *Amxp, double *vp, double *up, struct matrix_dsc_s *Bmxp);

/* Forward declarations: dimension checks for the three-operand products.
 * Definitions are not in this part of the file. */
unsigned int mxmul_check_abc(struct matrix_dsc_s *amxp, struct matrix_dsc_s *bmxp, struct matrix_dsc_s *cmxp);
unsigned int mxmul_check_aTbc(struct matrix_dsc_s *amxp, struct matrix_dsc_s *bmxp, struct matrix_dsc_s *cmxp);
unsigned int mxmul_check_abTc(struct matrix_dsc_s *amxp, struct matrix_dsc_s *bmxp, struct matrix_dsc_s *cmxp);
unsigned int nn_matrix_eq3(struct matrix_dsc_s *amxp, struct matrix_dsc_s *bmxp, struct matrix_dsc_s *cmxp);

/* Forward declarations: non-linear optimisation helpers.
 * NOTE(review): bfgs / dfp presumably name the quasi-Newton update
 * formulas - confirm at the definitions. */
void nlnopt_bfgs(struct matrix_dsc_s *H1mxp, struct matrix_dsc_s *Hmxp, double *deltap, double *gammap);
void nlnopt_dfp(struct matrix_dsc_s *H1mxp, struct matrix_dsc_s *Hmxp, double *deltap, double *gammap);

void nlnopt_mul_H_x(double *gp, double alpha, struct matrix_dsc_s *Hmxp, double *xp);
void nlnopt_vectors_sum(unsigned int n, double *wp, double *up, double *vp, double alpha);

void  matrix_op_luft(struct matrix_op_s *mop);
void  matrix_op_lubt(struct matrix_op_s *mop);
void  matrix_op_swrow(struct matrix_op_s *mop, unsigned int r1, unsigned int r2);
void  matrix_swrow(struct matrix_dsc_s *mxp, unsigned int r1, unsigned int r2);
void  matrix2row_echelon(struct matrix_dsc_s *mxp);
void  matrix_decompose_cholesky(struct matrix_dsc_s *mxp);
void  matrix_decompose_cholesky2(struct matrix_dsc_s *mxp);
void  matrix_decompose_crout(struct matrix_dsc_s *mxp);

void  vector_col2row(double **colpp, double *rowp, unsigned int sz, unsigned int col_num);
void  vector_row2col( double *rowp, double **colpp, unsigned int sz, unsigned int col_num);
unsigned int matrix_is_sdiagdom(struct matrix_dsc_s *mxp);

double *gje_sub_get_ap(struct matrix_dsc_s *mxp, unsigned int i);
void  lud_sub_get_ap_ip(struct matrix_dsc_s *mxp, struct matrix_dsc_s *mxlp, unsigned int i, double **app, double **ipp);

void   vector_zero(unsigned int n, double *vp);
double vectors_inner_prod(unsigned int n, double *v1p, double *v2p);
struct matrix_dsc_s *vectors_outer_prod(unsigned int m, unsigned int n, double *mp, double *np, double scale);
double vector_norm(unsigned int degree, unsigned int n, double *vp);
double vector_sum(unsigned int n, double *vp);
void   vector_scale(unsigned int n, double *vp, double l);
void   vector_normalize(unsigned int n, double *vp);
void   vector_dup(unsigned int n, double *dstp, double *srcp);
double *vector_dup2(unsigned int n, double *srcp);
void   vector_dup_int(unsigned int n, unsigned int *dstp, unsigned int *srcp);
void   vector_u_minus_v(unsigned int n, double *up, double *vp);
void   vector_u_plus_v(unsigned int n, double *up, double *vp);
void   vector_add_scaled(unsigned int n, double *up, double *vp, double l);
void   vector_add_scalar(unsigned int n, double *vp, double scalar);
void   vector_u_proj_on_v(unsigned int n, double *up, double *vp);
double vector_norm_sqrt(unsigned int n, double *vp);
double *vector_housevec(unsigned int n, double *xp, double *beta);
void  vector_housevec2(unsigned int n, double *xp, double *hvp, double *beta);
struct matrix_dsc_s *matrix_housemat(struct matrix_op_s *mop, unsigned int n, double *vp, double beta);
void matrix_submat_map(struct matrix_dsc_s *hmxp, struct matrix_dsc_s *submxp, unsigned int moff, unsigned int noff);
void matrix_submat_map2(struct matrix_dsc_s *hmxp, struct matrix_dsc_s *submxp, unsigned int moff, unsigned int noff);

void matrix_hh_premul(unsigned int n, double *vp, double beta, struct matrix_dsc_s *amxp, struct matrix_dsc_s *pamxp);
void matrix_hh_postmul(unsigned int n, double *vp, double beta, struct matrix_dsc_s *amxp, struct matrix_dsc_s *apmxp);
void matrix_hh_postmul2(unsigned int n, double *vp, double beta, struct matrix_dsc_s *amxp, struct matrix_dsc_s *apmxp);
void matrix_givens_rotate(double a, double b, double *cp, double *sp);
void matrix_givens_coord(double a, double b, double *newa, double *newb, double *cp, double *sp);
void matrix_givens_compute(struct matrix_dsc_s *gmxp, double a, double b, unsigned int flag_xpose);

unsigned int matrix_load_colvec(struct matrix_dsc_s *mxp, unsigned int ci, unsigned int ne, double *vp, unsigned int k);
unsigned int matrix_save_colvec(struct matrix_dsc_s *mxp, unsigned int ci, unsigned int ne, double *vp, unsigned int k);
unsigned int matrix_load_rowvec(struct matrix_dsc_s *mxp, unsigned int ri, unsigned int ne, double *vp, unsigned int k);
unsigned int matrix_save_rowvec(struct matrix_dsc_s *mxp, unsigned int ri, unsigned int ne, double *vp, unsigned int k);

void  matrix_houseQR(struct matrix_dsc_s *Amxp, struct matrix_dsc_s *Qmxp, struct matrix_dsc_s *Rmxp);
void  matrix_hessQZ(struct matrix_dsc_s *Amxp, struct matrix_dsc_s *Bmxp, 
	struct matrix_dsc_s *Qmxp, struct matrix_dsc_s *Zmxp);
void  matrix_hessQR(struct matrix_dsc_s *Hmxp, struct matrix_dsc_s *Qmxp, struct matrix_dsc_s *Rmxp);
void  matrix_francisQR(struct matrix_dsc_s *Amxp, struct matrix_dsc_s *Qmxp, struct matrix_dsc_s *Rmxp);
void  matrix_francisQR_init(struct matrix_dsc_s *Amxp, double *xp);
void  matrix_francisQR_column(struct matrix_dsc_s *Amxp, double *xp, unsigned int k);
struct matrix_dsc_s *matrix_backaccumQR(struct matrix_dsc_s *Amxp, unsigned int type);
struct matrix_dsc_s *matrix_backaccum_francisQ(struct matrix_dsc_s *mxp);
void  matrix_givensQR(struct matrix_dsc_s *Amxp, struct matrix_dsc_s *Qmxp, struct matrix_dsc_s *Rmxp);
void  matrix_houseHessenberg(struct matrix_dsc_s *Amxp, struct matrix_dsc_s *Hmxp, struct matrix_dsc_s *Umxp);
void  matrix_givensCS(struct matrix_dsc_s *gmxp, struct matrix_dsc_s *rmxp, struct matrix_dsc_s *qmxp);
void  matrix_tri_house(struct matrix_dsc_s *mxp, unsigned int k, unsigned int type);
void  matrix_bidiag_house(struct matrix_dsc_s *mxp);
void  matrix_tridiag_house(struct matrix_dsc_s *mxp);
void  matrix_houseSchur(struct matrix_dsc_s *Amxp, struct matrix_dsc_s *Smxp, struct matrix_dsc_s *Umxp);

void  matrix_vector_mul(struct matrix_dsc_s *mxp, double *up, double *vp, double scale);
void  matrix_hh_rank2(struct matrix_dsc_s *mxp, double *vp, double *up);		
void  matrix_set_uptriangular(struct matrix_dsc_s *mxp, unsigned int k);
void  matrix_set_lowtriangular(struct matrix_dsc_s *mxp, unsigned int k);
void  matrix_set_tridiag(struct matrix_dsc_s *mxp);
void  matrix_shift(struct matrix_dsc_s *mxp, double mu);
void  matrix_zero_eps(struct matrix_dsc_s *mxp, double eps);
void  matrix_set_givens(struct matrix_dsc_s *mxp, double c, double s);
void matrix_givens_transpose(struct matrix_dsc_s *mxp);
struct matrix_dsc_s *matrix_givens_gen(double theta);

void  matrix_step_QR(struct matrix_dsc_s *mxp);
void  matrix_hhvec_print(unsigned int n, double *vp, double beta);

void  matrix_submat_diagram(struct matrix_dsc_s *mxp, unsigned int rlo, 
	unsigned int rhi, unsigned int clo, unsigned int chi, unsigned int type);


double *matrix2vector(struct matrix_dsc_s *mxp, unsigned int *ne, unsigned int row);
struct matrix_dsc_s *vector2matrix(unsigned int ne, double *vp, 
	unsigned int nr, unsigned int nc, unsigned int row);
void matrix_mulsub_ab(struct matrix_dsc_s *amxp, struct matrix_dsc_s *gmxp, 
  struct matrix_dsc_s *smxp, struct matrix_dsc_s *tmxp, unsigned int moff, 
  unsigned int noff, unsigned int type);
void matrix_mulsub_ab_cols( struct matrix_dsc_s *amxp, struct matrix_dsc_s *gmxp, struct matrix_dsc_s *smxp, struct matrix_dsc_s *tmxp, unsigned int doff);

void  matrix_francis_shift(struct matrix_dsc_s *mxp, double *sum, double *prod);
void  matrix_francs_first_column(struct matrix_dsc_s *mxp);

void  vector_print2(unsigned int n, double *vp);
void  vector_sprint(char *msg, unsigned int n, double *vp);
void  vector_write_file(char *file, unsigned int n, double *vp, unsigned int f);
void  vector_rand01(unsigned int n, double *vp);

void  matrix_mul_aTbcd(struct matrix_dsc_s *amxp, struct matrix_dsc_s *bmxp, struct matrix_dsc_s *cmxp, struct matrix_dsc_s *dmxp);
void  matrix_mul_abcTd(struct matrix_dsc_s *amxp, struct matrix_dsc_s *bmxp, struct matrix_dsc_s *cmxp, struct matrix_dsc_s *dmxp);
void  matrix_mul_aTbad(struct matrix_dsc_s *amxp, struct matrix_dsc_s *bmxp, struct matrix_dsc_s *dmxp);
void  matrix_mul_abaTd(struct matrix_dsc_s *amxp, struct matrix_dsc_s *bmxp, struct matrix_dsc_s *dmxp);


/*
 *    Generate a 2 x 2 Givens rotation matrix.
 *
 *    theta is the rotation angle in radians (counter-clockwise, per the
 *    matrix id string).  cos(theta) and sin(theta) are handed to
 *    matrix_set_givens() to fill in the matrix.
 *
 *    Returns the newly created matrix, or NULL when matrix_create() fails.
 *    The matrix is not registered with any matrix_op_s.
 */
struct matrix_dsc_s *matrix_givens_gen(double theta)
{
    char id[1024];
    struct matrix_dsc_s *gmxp;

    snprintf(id, sizeof(id), "Givens rotation CCW %g (rad.)", theta);

    gmxp = matrix_create(id, 2, 2);
    if ( !gmxp ) {
        /* matrix_create() already reported the failure. */
        return NULL;
    }

    matrix_set_givens(gmxp, cos(theta), sin(theta));

    return gmxp;
}



/*
 *  Use the LAPACK FORTRAN subroutines to do the related 
 *  matrix decompositions.
 *
 */
extern void dgesvd_(char *ju, char*jv, int *m, int*n, double *A, int *lda, double *s, 
	 double *u, int *ldu, double *vt, int *ldvt, double *work, int*lwork,  int *info);



/*
 *  Compute the product A * A.T with nn_mul_abTc().
 *
 *  amxp : matrix A.
 *  cmxp : optional destination.  When NULL, a new (rows(A) x rows(A))
 *         matrix is created to hold the product.
 *
 *  Returns the matrix holding the product, or NULL when a destination
 *  had to be created and matrix_create() failed.
 *
 *  The product is now computed into the destination in both cases;
 *  previously a caller-supplied cmxp was returned without being filled.
 */
struct matrix_dsc_s *matrix_mul_aaT(struct matrix_dsc_s *amxp, struct matrix_dsc_s *cmxp)
{
    unsigned int m;
    struct matrix_dsc_s *pmxp = cmxp;

    if ( !pmxp ) {
        m = MXROWS(amxp);
        pmxp = matrix_create("A * A.T product", m, m);
        if ( !pmxp ) {
            return NULL;
        }
    }

    nn_mul_abTc(amxp, amxp, pmxp);

    return pmxp;
}


/*
 *  Compute the product A.T * A with nn_mul_aTbc().
 *
 *  amxp : matrix A.
 *  cmxp : optional destination.  When NULL, a new (cols(A) x cols(A))
 *         matrix is created to hold the product.
 *
 *  Returns the matrix holding the product, or NULL when a destination
 *  had to be created and matrix_create() failed.
 *
 *  The product is now computed into the destination in both cases;
 *  previously a caller-supplied cmxp was returned without being filled.
 */
struct matrix_dsc_s *matrix_mul_aTa(struct matrix_dsc_s *amxp, struct matrix_dsc_s *cmxp)
{
    unsigned int n;
    struct matrix_dsc_s *pmxp = cmxp;

    if ( !pmxp ) {
        n = MXCOLS(amxp);
        pmxp = matrix_create("A.T * A product", n, n);
        if ( !pmxp ) {
            return NULL;
        }
    }

    nn_mul_aTbc(amxp, amxp, pmxp);

    return pmxp;
}

/*
 *  Dispatch to one of the "matrix times its own transpose" products.
 *
 *  type == MXMUL_TYPE_AAT : returns matrix_mul_aaT(amxp, NULL)
 *  type == MXMUL_TYPE_ATA : returns matrix_mul_aTa(amxp, NULL)
 *  any other type         : returns NULL
 */
struct matrix_dsc_s *matrix_mul_aa(struct matrix_dsc_s *amxp, unsigned int type)
{
    if ( type == MXMUL_TYPE_AAT )
        return matrix_mul_aaT(amxp, NULL);

    if ( type == MXMUL_TYPE_ATA )
        return matrix_mul_aTa(amxp, NULL);

    return NULL;
}


/*
 *  Merge the matrices tracked by mop (mop->mpp) into one block matrix of
 *  rows x cols blocks via matrix_merge(), and register the result with
 *  mop through matrix_add_mxp().
 *
 *  Nothing is registered when matrix_merge() returns NULL.
 */
void   matrix_op_makeblock(struct matrix_op_s *mop, unsigned int rows, unsigned int cols)
{
    struct matrix_dsc_s *mxp;

    mxp = matrix_merge(mop->mpp, rows, cols);
    if ( !mxp ) {
        return;   /* do not store a NULL entry in mop->mpp */
    }

    matrix_add_mxp(mop, mxp);
}

/*
 *  Assemble one block matrix out of rows x cols sub-matrices.
 *
 *  mpp  : the sub-matrices, consumed consecutively: the first `cols`
 *         entries form the first block row, the next `cols` the second
 *         block row, and so on.
 *  rows : number of block rows.
 *  cols : number of block columns.
 *
 *  The destination is created by matrix_blksanity(), which also checks
 *  that the block dimensions fit together.  Every sub-matrix is printed
 *  with matrix_print() as it is copied, and the merged matrix is printed
 *  at the end.
 *
 *  Returns the merged matrix, or NULL when mpp is NULL, when rows or
 *  cols is 0, or when matrix_blksanity() rejects the layout.
 */
struct matrix_dsc_s *matrix_merge(struct matrix_dsc_s **mpp, unsigned int rows, unsigned cols)
{
    unsigned int i, j, k, l, p, q, c, blkcol_off;
    struct matrix_dsc_s *mxp, *smxp;
    double **rpp, *rp, **srpp, *srp;

    if ( !mpp || rows == 0 || cols == 0 ) {
        return NULL;
    }

    mxp = matrix_blksanity(mpp, rows, cols);
    if ( !mxp ) {
        return NULL;   /* layout rejected or no memory */
    }

    c = 0;
    rpp = MXRPP(mxp);

    for ( i = 0; i < rows; i++ ) {
        blkcol_off = 0;
        p = 0;
        for ( j = 0; j < cols; j++ ) {
            /* Go through all the matrices on one block row. */
            smxp = mpp[c++];
            srpp = MXRPP(smxp);
            p = MXROWS(smxp);
            q = MXCOLS(smxp);

            matrix_print(smxp);

            for ( k = 0; k < p; k++ ) {
                rp = rpp[k] + blkcol_off;
                srp = srpp[k];
                for ( l = 0; l < q; l++ ) {
                    rp[l] = srp[l];
                }
            }
            blkcol_off += q;
        }
        /* All blocks on this block row have p rows (checked by
         * matrix_blksanity()); move on to the first destination row
         * of the next block row. */
        rpp += p;
    }

    matrix_print(mxp);

    return mxp;
}

/*
 *  Run matrix_blksanity() on the matrices tracked by mop (mop->mpp) and,
 *  when the block layout is accepted, register the created block matrix
 *  with mop through matrix_add_mxp().
 *
 *  Returns the created matrix, or NULL when matrix_blksanity() returns
 *  NULL (in which case nothing is registered).
 */
struct matrix_dsc_s *matrix_op_blksanity(struct matrix_op_s *mop, 
    unsigned int rows, unsigned int cols)
{
    struct matrix_dsc_s *mxp;

    mxp = matrix_blksanity(mop->mpp, rows, cols);
    if ( mxp ) {
        matrix_add_mxp(mop, mxp);
    }

    return mxp;
}

/*
 *  Check that rows x cols sub-matrices can be tiled into one block
 *  matrix and, if so, create the (empty) destination matrix.
 *
 *  mpp  : the sub-matrices, read consecutively block row by block row.
 *  rows : number of block rows.
 *  cols : number of block columns.
 *
 *  Conditions checked:
 *    - all matrices on one block row have the same number of rows;
 *    - every block row adds up to the same total number of columns.
 *
 *  Returns a matrix created with matrix_create() sized to hold all the
 *  blocks, or NULL when mpp is NULL, rows or cols is 0, an entry of mpp
 *  is NULL, or one of the conditions above fails (a message is logged).
 */
struct matrix_dsc_s *matrix_blksanity(struct matrix_dsc_s **mpp, 
    unsigned int rows, unsigned int cols)
{
    unsigned int i, j, c = 0;
    unsigned int blk_rows = 0, row_cols = 0;
    unsigned int total_rows = 0, total_cols = 0;
    struct matrix_dsc_s *smxp;

    sprintf(dbg, "m, n = %u, %u", rows, cols); DBG(dbg);

    if ( !mpp || rows == 0 || cols == 0 ) {
        DBG("for block matrix, at least one block row and one block column are needed");
        return NULL;
    }

    for ( i = 0; i < rows; i++ ) {
        row_cols = 0;
        for ( j = 0; j < cols; j++ ) {
            smxp = mpp[c++];
            if ( !smxp ) {
                DBG("for block matrix, a NULL matrix was found in the block list");
                return NULL;
            }
            row_cols += MXCOLS(smxp);
            if ( j == 0 ) {
                /* The first block fixes the height of this block row. */
                blk_rows = MXROWS(smxp);
            } else if ( blk_rows != MXROWS(smxp) ) {
                sprintf(dbg, "for block matrix, the rows of matrices should be the same previous %u, now =%u",
                    blk_rows, (unsigned int)MXROWS(smxp)); DBG(dbg);
                return NULL;
            }
        }

        if ( i == 0 ) {
            /* The first block row fixes the total width. */
            total_cols = row_cols;
        } else if ( total_cols != row_cols ) {
            sprintf(dbg, "for block matrix, the columns of matrices should be the same previous %u, now =%u",
                total_cols, row_cols); DBG(dbg);
            return NULL;
        }

        total_rows += blk_rows;
    }

    return matrix_create("block matrix ", total_rows, total_cols);
}


/** Swap rows r1 and r2 of matrix A, i.e. the matrix stored at
 *  index mop->ia of mop->mpp.  The work is done by matrix_swrow(). **/
void matrix_op_swrow(struct matrix_op_s *mop, unsigned int r1, unsigned int r2)
{
    matrix_swrow(mop->mpp[ mop->ia ], r1, r2);
}

/** Swap rows r1 and r2 of the matrix pointed to by mxp.
 *  Only the two row pointers are exchanged; no element is copied.
 *  Nothing happens when either index is outside [0, num_rows). **/
void matrix_swrow(struct matrix_dsc_s *mxp, unsigned int r1, unsigned int r2)
{
    double **rows, *held;

    if ( r1 >= mxp->num_rows || r2 >= mxp->num_rows )
        return;

    rows = mxp->rowp;
    held = rows[r1];
    rows[r1] = rows[r2];
    rows[r2] = held;
}


/* Return the number of matrices currently tracked by mop. */
unsigned int get_num_matrixes(struct matrix_op_s *mop)
{
    unsigned int count = mop->num_matrixes;

    return count;
}

/*
 *   Create a random matrix with uniform distribution.
 *
 *   mop      : optional; when not NULL the new matrix is registered
 *              with it through matrix_add_mxp().
 *   num_rows : number of rows of the new matrix.
 *   num_cols : number of columns of the new matrix.
 *   lb, ub   : every element is set to ran1() * (ub - lb) + lb.
 *
 *   The matrix id is a description of the size and bounds, which is
 *   also logged.
 *
 *   Returns the new matrix, or NULL when matrix_create() fails (nothing
 *   is registered in that case).
 */
struct matrix_dsc_s *matrix_create_random(struct matrix_op_s *mop,
    unsigned int num_rows, unsigned int num_cols, double lb, double ub)
{
    unsigned int i, j;
    char buf[BUF_LEN];
    double a, b, **rpp, *rp;
    struct matrix_dsc_s *mxp;

    snprintf(buf, sizeof(buf), "random matrix (%4u x %4u) with uniform"
        " distribution low bound =%g upper bound = %g",
        num_rows, num_cols, lb, ub);
    DBG(buf);   /* log the description just built */

    mxp = matrix_create(buf, num_rows, num_cols);
    if ( !mxp ) {
        return NULL;
    }

    if ( mop ) {
        matrix_add_mxp(mop, mxp);
    }

    a = ub - lb,   b = lb;
    rpp = MXRPP(mxp);
    for ( i = 0; i < num_rows; i++ ) {
        rp = rpp[i];
        for ( j = 0; j < num_cols; j++ ) {
            rp[j] = ran1() * a + b;
        }
    }

    return mxp;
}

/** ------------ Init a matrix elements to be random numbers ----------- **/
void matrix_init_random(struct matrix_dsc_s *mxp, double lb, double ub)
{
	unsigned int i, j, m, n;
	double **rpp, *rp, a, b;

	a = ub - lb,   b = lb;
	//sprintf(dbg, " a = %g  b = %g", a, b); DBG(dbg); 
	
	rpp = MXRPP(mxp);
	m = MXROWS(mxp);
	n = MXCOLS(mxp);

	for ( i = 0; i < m; i++ ) {
		rp = rpp[i];
		for ( j = 0; j < n; j++ ) {
			rp[j] = ran1() * a + b;
		}
	}
}

void matrix_random_decision(struct matrix_dsc_s *mxp)
{
	MXPARAMS(mxp, i, j, m, n, rpp, rp);
	
	for ( i = 0; i < m; i++ ) {
		rp = rpp[i];
		for ( j = 0; j < n; j++ ) {
			rp[j] = (ran1() > rp[j]) ? 1.0 : 0.0;
		}
	}
}

/*
 *  Create a matrix with matrix_create() and register it with mop
 *  through matrix_add_mxp().
 *
 *  mop      : matrix-op context; when NULL the matrix is created but not
 *             registered (same convention as matrix_create_random()).
 *  idp      : matrix id string, passed to matrix_create().
 *  num_rows : number of rows.
 *  num_cols : number of columns.
 *
 *  Returns the new matrix, or NULL when matrix_create() fails (nothing
 *  is registered in that case).
 */
struct matrix_dsc_s *matrix_create2(struct matrix_op_s *mop,
    char *idp, unsigned int num_rows, unsigned int num_cols)
{
    struct matrix_dsc_s *mxp;

    mxp = matrix_create(idp, num_rows, num_cols);
    if ( mxp && mop ) {
        matrix_add_mxp(mop, mxp);
    }

    return mxp;
}


/*
 *  Allocate a zero-filled matrix of num_rows x num_cols doubles.
 *
 *  idp      : id string; it is duplicated with strdup().  A NULL idp is
 *             stored as an empty string.
 *  num_rows : number of rows.
 *  num_cols : number of columns.
 *
 *  Every row is allocated with (num_cols + 1) elements; the extra
 *  element is room for one augmented column.
 *
 *  Returns the new matrix, or NULL on allocation failure.  On failure
 *  everything allocated so far, including rows already created, is
 *  released and a message with errno is logged.
 */
struct matrix_dsc_s *matrix_create(char *idp, unsigned int num_rows, unsigned int num_cols)
{
    unsigned int i, m, n;
    struct matrix_dsc_s *mxp;

    m = num_rows;
    n = num_cols;

    mxp = calloc(1, sizeof(struct matrix_dsc_s) );
    if ( !mxp ) {
        sprintf(dbg, "error 1: %d", errno); DBG(dbg);
        return NULL;
    }

    mxp->id = strdup(idp ? idp : "");
    if ( !mxp->id ) {
        sprintf(dbg, "error 1a: %d", errno); DBG(dbg);
        free(mxp);
        return NULL;
    }
    mxp->num_rows = m;
    mxp->num_cols = n;

    /* At least one slot is requested so that a 0-row matrix is not
     * mistaken for an allocation failure; calloc() keeps the row
     * pointers NULL until they are filled in. */
    mxp->rowp = calloc( (m ? m : 1), sizeof( double *) );
    if ( !mxp->rowp ) {
        sprintf(dbg, "error 2: %d", errno); DBG(dbg);
        free(mxp->id);
        free(mxp);
        return NULL;
    }

    for ( i = 0; i < m; i++ ) {
        /* (n+1) is just for augmenting one more column. */
        mxp->rowp[i] = calloc( ((size_t)n + 1), sizeof(double) );
        if ( !mxp->rowp[i] ) {
            sprintf(dbg, "error 3: %d", errno); DBG(dbg);
            /* Release the rows created before the failure. */
            while ( i > 0 ) {
                free(mxp->rowp[--i]);
            }
            free(mxp->rowp);
            free(mxp->id);
            free(mxp);
            return NULL;
        }
    }

    return mxp;
}

/*
 *  Read a data set in a file into an existing matrix structure.
 *  The number of rows and columns of the matrix should match the number
 *  of data lines and the number of data fields per line, respectively.
 *
 *  mxp       : destination matrix, already created.
 *  datafilep : file name; whitespace characters in it are removed
 *              before the file is opened.
 *
 *  Lines starting with a newline, '#', '%' or '!' are skipped.
 *
 *  The first data line gets special treatment: when
 *  matrix_line_2ints() reports that it holds two integers and they
 *  equal the matrix dimensions, the line is taken as a size header and
 *  not stored; when they differ, the two values are stored as the first
 *  data row.  Every other data line is parsed by matrix_load_one_row().
 *
 *  Data lines beyond the number of rows of the matrix are counted but
 *  not stored, so the matrix is never written out of bounds.  A message
 *  is logged when the number of data lines differs from the number of
 *  rows.
 *
 *  The process exits with status 1 when the file cannot be opened.
 */
void matrix_load_file(struct matrix_dsc_s *mxp, char *datafilep)
{
    unsigned int i, m, n, num_rows, num_cols;
    size_t j, k, len;
    char c, fn[DS_LEN], buf[DS_LEN], *p, *ctx, *s2 = " \t,\r\n";
    FILE *fp;
    double **rpp, *rp;

    num_rows = MXROWS(mxp);    rpp = MXRPP(mxp);
    num_cols = MXCOLS(mxp);

    /** Remove all the whitespace chars in the file name, without ever
     *  writing past the end of fn[]. **/
    len = strlen(datafilep);
    j = 0;
    for ( k = 0; k < len && j < sizeof(fn) - 1; k++ ) {
        if ( !isspace((unsigned char)datafilep[k]) )
            fn[j++] = datafilep[k];
    }
    fn[j] = '\0';

    fp = fopen(fn, "r");
    if ( !fp ) {
        snprintf(buf, sizeof(buf),
            "Fatal error, could not open file \"%s\" : reason = %s",
            datafilep, strerror(errno) );
        DBG(buf);
        exit(1);
    }

    /* i counts every data line seen; a line is stored only while
     * i < num_rows. */
    i = 0;

    // ---  Read the first data line: it may be a "rows cols" header. ---
    while ( fgets(buf, sizeof(buf), fp ) ) {
        c = buf[0];
        if ( c == '\n' || c == '#' || c == '%' || c == '!' ) {
            continue;
        }

        if ( matrix_line_2ints(buf) ) {
            // This is inherently ambiguous: header or a 2-value row.
            p = strtok_r(buf,  s2, &ctx);   m = p ? (unsigned int)atoi(p) : 0;
            p = strtok_r(NULL, s2, &ctx);   n = p ? (unsigned int)atoi(p) : 0;
            if ( !((num_rows == m) && (num_cols == n)) ) {
                /* Not a size header: keep the two values as data. */
                if ( i < num_rows ) {
                    rp = rpp[i];
                    rp[0] = m;
                    if ( num_cols > 0 ) {
                        rp[1] = n;
                    }
                }
                i++;
            }
        } else {
            if ( i < num_rows ) {
                matrix_load_one_row(num_cols, rpp[i], buf);
            }
            i++;
        }
        break;  // Assume we got the first data line
    }

    while ( fgets(buf, sizeof(buf), fp ) ) {
        c = buf[0];
        if ( c == '\n' || c == '#' || c == '%' || c == '!' ) {
            continue;
        }
        if ( i < num_rows ) {
            matrix_load_one_row(num_cols, rpp[i], buf);
        }
        i++;
    }

    fclose(fp);

    if ( i != num_rows ) {
        snprintf(buf, sizeof(buf), "Error: num_rows = %u  i = %u",
            num_rows, i);
        DBG(buf);
    }
}

/*
 *  Allocate a matrix whose cells are string pointers (stored in
 *  srowppp) instead of doubles.
 *
 *  idp      : id string; it is duplicated with strdup().  A NULL idp is
 *             stored as an empty string.
 *  num_rows : number of rows.
 *  num_cols : number of columns.
 *
 *  Every row holds (num_cols + 1) char pointers, all NULL initially; the
 *  extra one is room for an augmented column.
 *
 *  Returns the new matrix, or NULL on allocation failure, in which case
 *  everything allocated so far is released.
 */
struct matrix_dsc_s *matrix_create_str(char *idp, unsigned int num_rows, unsigned int num_cols)
{
    unsigned int i, n, m;
    struct matrix_dsc_s *mxp;

    n = num_rows;
    m = num_cols;

    mxp = calloc(1, sizeof(struct matrix_dsc_s) );
    if ( !mxp ) {
        return NULL;
    }

    mxp->id = strdup(idp ? idp : "");
    if ( !mxp->id ) {
        free(mxp);
        return NULL;
    }
    mxp->num_rows = n;
    mxp->num_cols = m;

    /* At least one slot, so a 0-row matrix is not seen as a failure. */
    mxp->srowppp = calloc( (n ? n : 1), sizeof( char **) );
    if ( !mxp->srowppp ) {
        free(mxp->id);
        free(mxp);
        return NULL;
    }

    for ( i = 0; i < n; i++ ) {
        /* (m+1) is just for augmenting one more column. */
        mxp->srowppp[i] = calloc( ((size_t)m + 1), sizeof(char *) );
        if ( !mxp->srowppp[i] ) {
            while ( i > 0 ) {
                free(mxp->srowppp[--i]);
            }
            free(mxp->srowppp);
            free(mxp->id);
            free(mxp);
            return NULL;
        }
    }

    return mxp;
}


/*
 *  Check a list of slice indexes against an upper limit.
 *
 *  n     : number of indexes in the list.
 *  np    : the index list.
 *  max_n : exclusive upper limit; every index must be < max_n.
 *
 *  Returns 1 when all n indexes are below max_n (also when n is 0),
 *  and 0 as soon as one index is >= max_n.
 */
unsigned int matrix_slice_params_valid(unsigned int n, unsigned int *np, unsigned int max_n)
{
    const unsigned int *idx = np;
    unsigned int left = n;

    while ( left-- ) {
        if ( *idx++ >= max_n )
            return 0;
    }

    return 1;
}


/** Data transfer part of matrix_slice().  The destination tomxp must
 *  already have been created with the appropriate size on entry.
 *
 *  The selection is taken from frommxp: frommxp->mp lists frommxp->m
 *  row indexes and frommxp->np lists frommxp->n column indexes.
 *  Element (i, j) of tomxp receives element (mp[i], np[j]) of frommxp. **/
void matrix_slice_xo(struct matrix_dsc_s *tomxp, struct matrix_dsc_s *frommxp)
{
    const unsigned int nsel_rows = frommxp->m;
    const unsigned int nsel_cols = frommxp->n;
    unsigned int r, c;

    for ( r = 0; r < nsel_rows; r++ ) {
        double *dst = tomxp->rowp[r];
        double *src = frommxp->rowp[ frommxp->mp[r] ];

        for ( c = 0; c < nsel_cols; c++ ) {
            dst[c] = src[ frommxp->np[c] ];
        }
    }
}

/* 
 *  Pull out the sliced rows and columns in matrix mxp and
 *  put them into a newly created matrix and return the pointer
 *  to the newly created and filled matrix.
 *    
 *    rows to slice are in mxp->mp, m is the number of indexes in the list mp
 *    cols to slice are in mxp->np, n is the number of indexes in the list np
 *
 *  Returns NULL when:
 *    - m or n is 0, m > num_rows or n > num_cols;
 *    - mxp->mp or mxp->np is NULL;
 *    - an index in mp is >= num_rows or an index in np is >= num_cols;
 *    - matrix_create() fails.
 *
 *  The id of the new matrix is "sliced matrix of <id of mxp>", truncated
 *  to fit BUF_LEN characters.
 */
struct matrix_dsc_s *matrix_slice(struct matrix_dsc_s *mxp)
{
    unsigned int m, n;
    struct matrix_dsc_s *newmxp;
    char id[BUF_LEN];

    m = mxp->m;   n = mxp->n;

    if ( !m || !n || (m > mxp->num_rows) || (n > mxp->num_cols) ) {
        return NULL;
    }
    if ( !mxp->mp || !mxp->np ) {
        return NULL;
    }
    if ( !matrix_slice_params_valid(m, mxp->mp, mxp->num_rows) ||
         !matrix_slice_params_valid(n, mxp->np, mxp->num_cols) ) {
        return NULL;
    }

    /* Bounded: the id of mxp can be arbitrarily long. */
    snprintf(id, sizeof(id), "sliced matrix of %s", mxp->id ? mxp->id : "");
    DBG(id);

    newmxp = matrix_create(id, m, n);
    if ( newmxp ) {
        matrix_slice_xo(newmxp, mxp);
    }

    return newmxp;
}

/** Fill in the slice matrix smxp from the original matrix mxp.
 *  The smxp storage must already be created with the appropriate dimensions.
 *
 **/

/**
 *
 *  This is the auxiliary routine to matrix_slice().  It sets 
 *  the params in matrix mxp for the rows and cols to be pulled 
 *  out to form a new sliced matrix: mxp->m / mxp->n get the counts and
 *  mxp->mp / mxp->np get freshly allocated, zero-filled index lists of
 *  that many entries.  Note the contents of the index lists mxp->mp 
 *  and mxp->np need to be set up separately, not in this routine.
 *
 *  When an allocation fails, both counts are set to 0 and both lists to
 *  NULL, so that matrix_slice() returns NULL instead of reading a NULL
 *  list.
 *
 *  NOTE(review): the previous mxp->mp / mxp->np are overwritten without
 *  being freed, as before; confirm who owns them.
 *
 **/
void matrix_slice_params(struct matrix_dsc_s *mxp, unsigned int num_rows, unsigned int num_cols)
{
    unsigned int *rowsel, *colsel;

    /* At least one entry is requested so that a zero count is not
     * mistaken for an allocation failure. */
    rowsel = calloc( (num_rows ? num_rows : 1), sizeof(unsigned int) );
    colsel = calloc( (num_cols ? num_cols : 1), sizeof(unsigned int) );

    if ( !rowsel || !colsel ) {
        free(rowsel);
        free(colsel);
        mxp->m = 0;   mxp->mp = NULL;
        mxp->n = 0;   mxp->np = NULL;
        return;
    }

    mxp->m = num_rows;   mxp->mp = rowsel;
    mxp->n = num_cols;   mxp->np = colsel;
}

/**----- Generate a range of unsigned ints and store them in a list ---**/
//  The list holds head, head+1, ..., tail-1, i.e. the range [head, tail).
//  Returns the list obtained from MALLOC(), or NULL when tail <= head or
//  when no memory could be obtained (a message with errno is logged).
unsigned int *nau_gen_range(unsigned int head, unsigned int tail)
{
	unsigned int *ip, k, count;

	if ( tail <= head )
		return NULL;

	count = tail - head;
	ip = MALLOC(count, int);
	if ( !ip ) {
		sprintf(dbg, "ip no memory callocated ... %d",
			errno); DBG(dbg); 
		return ip;
	}

	for ( k = 0; k < count; k++ )
		ip[k] = head + k;

	return ip;
}

/**
 *  This is the auxiliary routine to matrix_slice().  It copies the row
 *  indexes and the column indexes of the elements to be pulled out of
 *  the matrix into the index lists of mxp.
 *  NB: mxp->m is the number of entries copied from rdp into mxp->mp,
 *      mxp->n is the number of entries copied from cdp into mxp->np.
 *      Both lists of mxp must already be allocated.
 **/
void matrix_slice_set_params(struct matrix_dsc_s *mxp, unsigned int *rdp, unsigned int *cdp)
{
    const unsigned int nrow_idx = mxp->m;
    const unsigned int ncol_idx = mxp->n;
    unsigned int k;

    k = 0;
    while ( k < nrow_idx ) {
        mxp->mp[k] = rdp[k];
        k++;
    }

    k = 0;
    while ( k < ncol_idx ) {
        mxp->np[k] = cdp[k];
        k++;
    }
}

/*
 *  Slice matrix mxp with explicit index lists.
 *
 *  m, mp : number of row indexes and the row index list.
 *  n, np : number of column indexes and the column index list.
 *
 *  Both lists are duplicated with vector_dup_int() into newly allocated
 *  storage that is stored in mxp->mp / mxp->np (with the counts in
 *  mxp->m / mxp->n); matrix_slice() then builds the sliced matrix.
 *
 *  Returns the sliced matrix, or NULL when the lists cannot be
 *  allocated (mxp is left untouched in that case) or when
 *  matrix_slice() returns NULL.
 *
 *  NOTE(review): the previous mxp->mp / mxp->np are overwritten without
 *  being freed, as before; confirm who owns them.
 */
struct matrix_dsc_s *matrix_slice_mn(struct matrix_dsc_s *mxp, 
    unsigned int m, unsigned int *mp, unsigned int n, unsigned int *np)
{
    unsigned int *rowsel, *colsel;
    struct matrix_dsc_s *smxp;

    /* At least one entry is requested so that a zero count is not
     * mistaken for an allocation failure. */
    rowsel = malloc( (m ? m : 1) * sizeof(unsigned int) );
    colsel = malloc( (n ? n : 1) * sizeof(unsigned int) );
    if ( !rowsel || !colsel ) {
        free(rowsel);
        free(colsel);
        DBG("no memory for the slice index lists");
        return NULL;
    }

    vector_dup_int(m, rowsel, mp);  // duplicate the row index list
    vector_dup_int(n, colsel, np);  // duplicate the column index list

    mxp->m = m;   mxp->mp = rowsel;
    mxp->n = n;   mxp->np = colsel;

    DBG("running matrix_slice() ...."); 
    smxp = matrix_slice(mxp);

    return smxp;
}

/**  Resize a matrix to the size of (rn x cn).
 *
 *  Rows that survive keep their leading elements.  Added rows and added
 *  columns are zero-filled.  Every row is kept at (cn + 1) elements so
 *  that there is room for one augmented column.
 *
 *  When mxp->no_rp is set, the row storage itself is never freed,
 *  reallocated or allocated here: only the row pointer array and the
 *  dimensions are adjusted, and added row slots are set to NULL.
 *
 *  On allocation failure a message is written to stderr and the matrix
 *  is left in a usable state: num_rows / num_cols describe only storage
 *  that really exists (so the resize may be partial).
 **/
void matrix_resize(struct matrix_dsc_s *mxp, unsigned int rn, unsigned int cn)
{
    unsigned int i, j, old_rows, old_cols, keep;
    double **rowpp, *rp;

    old_rows = MXROWS(mxp), old_cols = MXCOLS(mxp);

    if ( rn == old_rows && cn == old_cols ) {
        return;   // Do nothing
    }

    /* 1. Fewer rows: release the surplus rows, then trim the array. */
    if ( rn < old_rows ) {
        if ( !mxp->no_rp ) {
            for ( i = rn; i < old_rows; i++ ) {
                free(mxp->rowp[i]);
                mxp->rowp[i] = NULL;
            }
        }
        /* Never ask for 0 bytes; if trimming fails the old, larger
         * array is still valid and is simply kept. */
        rowpp = realloc(mxp->rowp, (rn ? rn : 1) * sizeof(double *));
        if ( rowpp ) {
            mxp->rowp = rowpp;
        }
        mxp->num_rows = rn;
    }
    keep = (rn < old_rows) ? rn : old_rows;

    /* 2. Different width: adjust every surviving row. */
    if ( cn != old_cols ) {
        if ( !mxp->no_rp ) {
            for ( i = 0; i < keep; i++ ) {
                rp = realloc(mxp->rowp[i], ((size_t)cn + 1) * sizeof(double));
                if ( !rp ) {
                    if ( cn > old_cols ) {
                        fprintf(stderr, "# %s: no memory to widen row %u\n",
                            __func__, i);
                        return;   /* num_cols still describes every row */
                    }
                    continue;     /* shrinking: the old block is big enough */
                }
                /* Elements 0..old_cols existed before; clear the rest. */
                for ( j = old_cols + 1; j <= cn; j++ ) {
                    rp[j] = 0.0;
                }
                mxp->rowp[i] = rp;
            }
        }
        mxp->num_cols = cn;
    }

    /* 3. More rows: grow the array, then create the added rows. */
    if ( rn > old_rows ) {
        rowpp = realloc(mxp->rowp, rn * sizeof(double *));
        if ( !rowpp ) {
            fprintf(stderr, "# %s: no memory for %u row pointers\n",
                __func__, rn);
            return;
        }
        mxp->rowp = rowpp;

        for ( i = old_rows; i < rn; i++ ) {
            if ( mxp->no_rp ) {
                rowpp[i] = NULL;
                continue;
            }
            rowpp[i] = calloc((size_t)cn + 1, sizeof(double));
            if ( !rowpp[i] ) {
                fprintf(stderr, "# %s: no memory for row %u\n", __func__, i);
                mxp->num_rows = i;   /* only the rows that exist */
                return;
            }
        }
        mxp->num_rows = rn;
    }
}


/**  Add one column in the specified matrix.
 *
 *  Every row is grown to (num_cols + 2) elements, i.e. the new column
 *  count plus the spare element kept for an augmented column.  The
 *  element that used to be the spare one becomes the new column and
 *  keeps its content; the new spare element is set to 0.
 *
 *  When a row cannot be grown, a message is written to stderr and
 *  num_cols is left unchanged (rows already grown are merely larger
 *  than needed).
 **/
void matrix_inc_col(struct matrix_dsc_s *mxp)
{
    unsigned int i, n, m;
    double *rp;

    n = mxp->num_rows;
    m = mxp->num_cols;

    for ( i = 0; i < n; i++ ) {
        rp = realloc( mxp->rowp[i], ((size_t)m + 2) * sizeof(double) );
        if ( !rp ) {
            fprintf(stderr, "# %s: no memory to widen row %u\n", __func__, i);
            return;
        }
        rp[m + 1] = 0.0;
        mxp->rowp[i] = rp;
    }

    mxp->num_cols++;
}

/**  Add one row in the specified matrix.
 *
 *  The new, zero-filled row has (num_cols + 1) elements; the extra
 *  column is for an augmented matrix.
 *
 *  Returns the pointer to the new row, or NULL when memory cannot be
 *  obtained, in which case the matrix is left unchanged.
 **/
double *matrix_inc_row(struct matrix_dsc_s *mxp)
{
    unsigned int n, m;
    double *rp, **rowpp;

    n = mxp->num_rows;
    m = mxp->num_cols;

    rp = calloc( (size_t)m + 1, sizeof(double) );
    if ( !rp ) {
        return NULL;
    }

    rowpp = realloc(mxp->rowp, ((size_t)n + 1) * sizeof(double *) );
    if ( !rowpp ) {
        free(rp);
        return NULL;
    }

    rowpp[n] = rp;
    mxp->rowp = rowpp;
    mxp->num_rows++;

    return rp;
}

/**  Add one row in the specified string matrix (srowppp storage).
 *
 *  The new row has (num_cols + 1) char pointers, all NULL; the extra
 *  column is for an augmented matrix.
 *
 *  Returns the pointer to the new row, or NULL when memory cannot be
 *  obtained, in which case the matrix is left unchanged.
 **/
char **matrix_inc_row_str(struct matrix_dsc_s *mxp)
{
    unsigned int n, m;
    char **rpp, ***grown;

    n = mxp->num_rows;
    m = mxp->num_cols;

    rpp = calloc( (size_t)m + 1, sizeof(char *) );
    if ( !rpp ) {
        return NULL;
    }

    grown = realloc(mxp->srowppp, ((size_t)n + 1) * sizeof(char **) );
    if ( !grown ) {
        free(rpp);
        return NULL;
    }

    grown[n] = rpp;
    mxp->srowppp = grown;
    mxp->num_rows++;

    return rpp;
}

/**  Remove the last row of the specified matrix.
 *
 *  The storage of the last row is freed and the row pointer array is
 *  trimmed.  Nothing is done when the matrix has no rows.
 **/
void matrix_dec_row(struct matrix_dsc_s *mxp)
{
    unsigned int n;
    double **rowpp;

    if ( mxp->num_rows == 0 ) {
        return;   /* avoid wrapping the unsigned row count */
    }

    n = mxp->num_rows - 1;
    free(mxp->rowp[n]);
    mxp->rowp[n] = NULL;

    /* Never ask for 0 bytes; if trimming fails the old, larger array is
     * still valid and is simply kept. */
    rowpp = realloc(mxp->rowp, (n ? n : 1) * sizeof(double *));
    if ( rowpp ) {
        mxp->rowp = rowpp;
    }

    mxp->num_rows = n;
}


/*
 *   Test whether the matrix is a row vector:
 *   returns 1 when it has exactly one row, 0 otherwise.
 */
unsigned int matrix_is_row(struct matrix_dsc_s *mxp)
{
    if ( mxp->num_rows == 1 )
        return 1;

    return 0;
}

/*
 *   Test whether the matrix is a column vector:
 *   returns 1 when it has exactly one column, 0 otherwise.
 */
unsigned int matrix_is_column(struct matrix_dsc_s *mxp)
{
    if ( mxp->num_cols == 1 )
        return 1;

    return 0;
}


/*
 *   Transpose a matrix in place.
 *
 *   Square matrix    : the elements above and below the diagonal are
 *                      swapped; no storage is allocated.
 *   Non-square matrix: new row storage of (cols x rows) is allocated and
 *                      filled, the old rows are freed and the dimensions
 *                      of mxp are swapped.  Every new row has one spare,
 *                      zeroed element for an augmented column.
 *
 *   When the new storage cannot be allocated, a message is written to
 *   stderr and the matrix is left unchanged.
 */
void matrix_xpose(struct matrix_dsc_s *mxp)
{
    unsigned int i, j, n, m;
    double t, *rp, *cp, **rowp;

    n = MXROWS(mxp);
    m = MXCOLS(mxp);

    if ( n == m ) {   // This is a square matrix.
        /* "i + 1 < n" instead of "i < n - 1": n is unsigned, so n - 1
         * would wrap around for an empty matrix. */
        for ( i = 0; i + 1 < n; i++ ) {
            rp = mxp->rowp[i];
            for ( j = i+1; j < m; j++ ) {
                cp = mxp->rowp[j];
                t = cp[i];
                cp[i] = rp[j];
                rp[j] = t;
            }
        }
        return;
    }

    // This is not a square matrix.

    // 1. Create a new row pointer array and the transposed rows.
    rowp = calloc( (m ? m : 1), sizeof(double *) );
    if ( !rowp ) {
        fprintf(stderr, "# %s: no memory for %u row pointers\n", __func__, m);
        return;
    }
    for ( i = 0; i < m; i++ ) {
        rowp[i] = rp = calloc( (size_t)n + 1, sizeof(double) );
        if ( !rp ) {
            fprintf(stderr, "# %s: no memory for row %u\n", __func__, i);
            while ( i > 0 ) {
                free(rowp[--i]);
            }
            free(rowp);
            return;
        }
        for ( j = 0; j < n; j++ ) { // move along the column i of
            cp = mxp->rowp[j];      // the existing matrix
            rp[j] = cp[i];
        }
    }

    // 2. Free the old one.
    for ( i = 0; i < n; i++ ) {
        free(mxp->rowp[i]);
    }
    free(mxp->rowp);

    // 3. Set the new metric for the transposed matrix.
    mxp->num_rows = m;
    mxp->num_cols = n;
    mxp->rowp = rowp;
}


/*
 *   Transpose a matrix: version 2.
 *   The storage for the transposed matrix was already created by the
 *   caller; it is the argument tmxp.  mxp itself is only read.
 *
 *   Square matrix    : mxp is first copied to tmxp with matrix_copy_ab(),
 *                      then tmxp is transposed in place.
 *   Non-square matrix: element (i, j) of mxp is written to element
 *                      (j, i) of tmxp.  tmxp must have at least
 *                      cols(mxp) rows and rows(mxp) columns; otherwise a
 *                      message is written to stderr and nothing is
 *                      written to tmxp.
 */
void matrix_xpose2(struct matrix_dsc_s *mxp, struct matrix_dsc_s *tmxp)
{
    unsigned int i, j, n, m;
    double t, *rp, *cp, **rowp;

    n = MXROWS(mxp);
    m = MXCOLS(mxp);

    if ( n == m ) {   // This is a square matrix.
        matrix_copy_ab(mxp, tmxp);   // Copy all the data over to xposed matrix first

        /* "i + 1 < n" instead of "i < n - 1": n is unsigned, so n - 1
         * would wrap around for an empty matrix. */
        for ( i = 0; i + 1 < n; i++ ) { // then transposing
            rp = tmxp->rowp[i];
            for ( j = i+1; j < m; j++ ) {
                cp = tmxp->rowp[j];
                t = cp[i];
                cp[i] = rp[j];
                rp[j] = t;
            }
        }
        return;
    }

    // This is not a square matrix.
    if ( MXROWS(tmxp) < m || MXCOLS(tmxp) < n ) {
        fprintf(stderr, "# %s: destination is %u x %u, need at least %u x %u\n",
            __func__, (unsigned int)MXROWS(tmxp), (unsigned int)MXCOLS(tmxp), m, n);
        return;
    }

    rowp = MXRPP(tmxp);
    for ( i = 0; i < n; i++ ) {
        cp = mxp->rowp[i];
        for ( j = 0; j < m; j++ ) { // move along the row i of mxp
            rp  = rowp[j];
            rp[i] = cp[j];
        }
    }
}

/*
 *   Transpose, in place, the matrix stored at index mop->it of
 *   mop->mpp.  The work is done by matrix_xpose().
 */
void matrix_op_xpose(struct matrix_op_s *mop)
{
    matrix_xpose(mop->mpp[ mop->it ]);
}

/*
 *   Set each and every element in the matrix to zero.
 *   Only the num_rows x num_cols elements are touched.
 */
void matrix_clear(struct matrix_dsc_s *mxp)
{
    const unsigned int nrows = mxp->num_rows;
    const unsigned int ncols = mxp->num_cols;
    unsigned int r = 0, c;

    while ( r < nrows ) {
        double *cells = mxp->rowp[r++];

        for ( c = ncols; c > 0; c-- ) {
            cells[c - 1] = 0;
        }
    }
}

/*
 *   Negate the sign of each and every element in the matrix.
 *   Only the num_rows x num_cols elements are touched.
 */
void matrix_nsign(struct matrix_dsc_s *mxp)
{
    const unsigned int nrows = mxp->num_rows;
    const unsigned int ncols = mxp->num_cols;
    unsigned int r = 0, c;

    while ( r < nrows ) {
        double *cells = mxp->rowp[r++];

        for ( c = ncols; c > 0; c-- ) {
            cells[c - 1] = -cells[c - 1];
        }
    }
}

/* matrix utility: add the created matrix dsc to the mop->mpp array
 * so that MOP can keep track of it. */
unsigned int matrix_add_mxp(struct matrix_op_s *mop, struct matrix_dsc_s *mxp)
{
    unsigned int id = mop->num_matrixes;

    mop->mpp = realloc( mop->mpp, (mop->num_matrixes + 1) * sizeof(struct matrix_dsc_s *) );
    mop->mpp[ mop->num_matrixes++ ] = mxp;
    if ( mop->num_matrixes > (1 << 8 ) ) {
        sprintf(dbg, "Warning matrix index > %d", (1<<9) ); DBG(dbg);
    }
    return id;  // The matrix index/id in mop->mpp.
}

/*
 *  Remove matrix mxp from the matrices tracked by mop and release it
 *  with matrix_dsc_fini().
 *
 *  Every entry of mop->mpp equal to mxp is dropped and the remaining
 *  entries are compacted to the front, keeping their order.  The matrix
 *  is finalised exactly once, even when it was listed more than once,
 *  and mop->num_matrixes is set to the number of entries kept.
 *
 *  Returns the number of matrices left in mop after the deletion, or 0
 *  when mxp is NULL or is not tracked by mop (a message is logged).
 *  Note 0 is therefore also returned when the last matrix is deleted.
 */
unsigned int matrix_del_mxp(struct matrix_op_s *mop, struct matrix_dsc_s *mxp)
{
    unsigned int i, k, n, y = 0;
    struct matrix_dsc_s **mpp, *p;

    if ( !mxp ) {
        DBG("Warning: no mxp specified. Nothing was done."); 
        return y;
    }

    mpp = mop->mpp, k = 0;
    n = mop->num_matrixes;

    for ( i = 0; i < n; i++ ) {
        p = mpp[i];
        if ( p == mxp ) {
            y = 1;          /* drop this entry */
        } else {
            mpp[k++] = p;   /* keep it, compacted to the front */
        }
    }

    // --- We don't shrink the mop->mpp in hope that matrix_add_mxp()
    // would adjust it later on.
    if ( y ) {
        matrix_dsc_fini(mxp);
        n = k;
        mop->num_matrixes = n;
    } else {
        sprintf(dbg, "Specified matrix %s was not found. Nothing deleted.", 
        MXID(mxp) ); DBG(dbg); 
        n = y;
    }

    return n;
}

/*
 *  Delete a matrix designated by a string: the matrix is looked up with
 *  mop2mxp_by_str(mop, mxstrp) and the result is handed to
 *  matrix_del_mxp(), whose return value is returned unchanged.
 */
unsigned int matrix_del_mxp_str(struct matrix_op_s *mop, char *mxstrp)
{
	struct matrix_dsc_s *target = mop2mxp_by_str(mop, mxstrp);

	return matrix_del_mxp(mop, target);
}

/* ---- Compute the inverse matrix of matrix A ----
 *
 * Matrix A is the matrix at index mop->ia of mop->mpp.  Nothing is done
 * when A is not square.  Otherwise, in this order:
 *   1. matrix_op_dup(mop)      - per the original note, makes a backup
 *                                copy of the original A;
 *   2. matrix_op_cid(mop, n)   - per the original note, creates the
 *                                identity matrix that will receive the
 *                                inverse (n = order of A);
 *   3. matrix_compute_inv()    - called with A and the matrix found at
 *                                index mop->id after step 2.
 */
void matrix_op_inv(struct matrix_op_s *mop)
{
    struct matrix_dsc_s *src = mop->mpp[ mop->ia ];
    const unsigned int order = src->num_rows;

    if ( order != src->num_cols )
        return;

    matrix_op_dup(mop);
    matrix_op_cid(mop, order);

    matrix_compute_inv(src, mop->mpp[ mop->id ]);
}

/*
 *  Compute the inverse of the (nonsingular, square) matrix mxp by
 *  Gauss-Jordan elimination and store the result in invmxp.  Storage
 *  space should already be allocated for invmxp.
 *
 *  mxp itself is not modified: the elimination runs on a private
 *  duplicate that is released before returning.  The id string of
 *  invmxp is replaced by "Inverse matrix of <id of mxp>".
 *
 *  A non-square mxp is ignored.  No pivoting is performed, so a zero
 *  showing up on the diagonal during elimination results in a division
 *  by zero.
 */
void matrix_compute_inv(struct matrix_dsc_s *mxp, struct matrix_dsc_s *invmxp)
{
    int i, j, n, m, k;
    double mu, *ip, *iip, *ap, *bp;
    struct matrix_dsc_s *mxap;
    char id[256];

    mxap = matrix_dup(mxp);
    if ( !mxap ) return;
  
    n = mxap->num_rows;
    m = mxap->num_cols;
    if ( n != m ) {
        matrix_dsc_fini(mxap);   // don't leak the working copy
        return;
    }
    
    invmxp->num_rows = n;
    invmxp->num_cols = n;
    
    matrix_set_id(invmxp);  // set invmxp to identity matrix first

    // Build the new label before the old one is released, and name the
    // matrix that is being inverted rather than the (freed) old label.
    snprintf(id, sizeof id, "Inverse matrix of %s", mxp->id ? mxp->id : "");
    free(invmxp->id);
    invmxp->id = strdup(id);

    /** Forward elimination **/
    for ( i = 0; i < n; i++ ) {
        ap = mxap->rowp[i];
        ip = invmxp->rowp[i];

        // Scale the pivot row so that its diagonal element becomes 1.
        mu = ap[i];
        if ( mu != 1.0 ) {
            for ( k = i; k < n; k++ ) {
                ap[k] /= mu;
            }
            for ( k = 0; k < n; k++ ) {
                ip[k] /= mu;
            }
        }

        // Clear column i in every row below the pivot row.
        for ( j = i + 1; j < n; j++ ) {
            bp = mxap->rowp[j];
            iip = invmxp->rowp[j];

            mu = bp[i] / ap[i];
            for ( k = i; k < n; k++ ) {
                bp[k] -= ap[k] * mu;
            }
            for ( k = 0; k < n; k++ ) {
                iip[k] -= ip[k] * mu;
            }
        }
    }

    /** Backward elimination **/
    for ( i = n-1; i > 0; i-- ) {
        ap = mxap->rowp[i];
        ip = invmxp->rowp[i];
        for ( j = i-1; j >= 0; j-- ) {
            bp = mxap->rowp[j];
            iip = invmxp->rowp[j];

            if ( bp[i] == 0.0 )
                continue;

            mu = bp[i] / ap[i];

            for ( k = 0; k < n; k++ ) {
                 bp[k] -= ap[k] * mu; 
                iip[k] -= ip[k] * mu;
            }
        }
    }
    
    matrix_dsc_fini(mxap);   // release the duplicate matrix.
}

/*
 *  Create an n x n identity matrix, register it with mop through
 *  matrix_add_mxp() and record its slot in mop->id.
 */
void matrix_op_cid(struct matrix_op_s *mop, unsigned int num_rows)
{
    struct matrix_dsc_s *eye = matrix_create("identity", num_rows, num_rows);
    unsigned int r, c;

    for ( r = 0; r < num_rows; r++ ) {
        double *row = eye->rowp[r];

        for ( c = 0; c < num_rows; c++ )
            row[c] = ( r == c ) ? 1.0 : 0.0;
    }

    matrix_add_mxp(mop, eye);
    mop->id = mop->num_matrixes - 1;   // the slot that was just filled
}

/** Create an m x n matrix with every element set to 1 and return it. **/
struct matrix_dsc_s *matrix_ones(unsigned int num_rows, unsigned int num_cols)
{
	struct matrix_dsc_s *ones = matrix_create("ones ", num_rows, num_cols);
	double **rows = MXRPP(ones);
	unsigned int r, c;

	for ( r = 0; r < num_rows; r++ ) {
		double *row = rows[r];

		for ( c = 0; c < num_cols; c++ )
			row[c] = 1;
	}

	return ones;
}


/*
 *  Create an n x n identity matrix.
 *  Version 2: the new matrix is not registered anywhere; a pointer to
 *  it is returned to the caller.
 */
struct matrix_dsc_s *matrix_create_eye(unsigned int num_rows)
{
    struct matrix_dsc_s *eye = matrix_create("identity", num_rows, num_rows);
    unsigned int r = 0, c;

    while ( r < num_rows ) {
        double *row = eye->rowp[r];

        // Clear the whole row, then put the 1 on the diagonal.
        for ( c = 0; c < num_rows; c++ )
            row[c] = 0.0;
        row[r] = 1.0;
        r++;
    }

    return eye;
}

/**
 * Overwrite a square matrix with the identity matrix I.
 * A non-square matrix is left untouched and a debug message is logged.
 **/
void matrix_set_id(struct matrix_dsc_s *mxp)
{
    unsigned int order = MXROWS(mxp);
    unsigned int r, c;

    if ( order != MXCOLS(mxp) ) {
        sprintf(dbg, " Matrix %s is not a square matrix. Do nothing.", mxp->id);
        DBG(dbg);     
        return;
    }

    for ( r = 0; r < order; r++ ) {
        double *row = mxp->rowp[r];

        for ( c = 0; c < order; c++ )
            row[c] = ( r == c ) ? 1.0 : 0.0;
    }
}

/*
 *  Replace the id string of mxp with a private copy of idstrp.
 *
 *  The copy is made before the old string is released, so passing the
 *  matrix's own current id (idstrp == mxp->id) is safe.  A NULL idstrp
 *  simply clears the id.
 */
void  matrix_update_idstring(struct matrix_dsc_s *mxp, char *idstrp)
{
	char *fresh = idstrp ? strdup(idstrp) : NULL;

	free(mxp->id);   // free(NULL) is a no-op, no guard needed
	mxp->id = fresh;
}

/*
 *  Store val at element (i, j) of mxp.
 *
 *  (i, j) specifies the direct indices of row and column, both
 *  start with 0.  Both indices must lie inside the matrix; this is
 *  enforced with an assert().
 */
void matrix_set_value(struct matrix_dsc_s *mxp, unsigned int i, unsigned int j, double val)
{
    // Both indices have to be in range, not just one of them.
    assert( (i < mxp->num_rows) && (j < mxp->num_cols) );
    mxp->rowp[i][j] = val;
}

/*
 *  Return the value stored at element (i, j) of mxp.
 *
 *  (i, j) specifies the direct indices of row and column, both
 *  start with 0.  Both indices must lie inside the matrix; this is
 *  enforced with an assert().
 */
double matrix_get_value(struct matrix_dsc_s *mxp, unsigned int i, unsigned int j)
{
    // Both indices have to be in range, not just one of them.
    assert( (i < mxp->num_rows) && (j < mxp->num_cols) );
    return mxp->rowp[i][j];
}


/*
 *  Set every element in column ci of mxp to val.
 *  A column index outside the matrix is silently ignored.
 */
void matrix_set_colval(struct matrix_dsc_s *mxp, unsigned int ci, double val)
{
	double **rows = MXRPP(mxp);
	unsigned int nrows = MXROWS(mxp);
	unsigned int r;

	if ( ci >= MXCOLS(mxp) )
		return;

	for ( r = 0; r < nrows; r++ )
		rows[r][ci] = val;
}

void matrix_set_rowval(struct matrix_dsc_s *mxp, unsigned int ri, double val)
{
	unsigned int i, j, m, n;
	double **rpp, *rp;
	
	rpp = MXRPP(mxp);
	n = MXROWS(mxp),  m = MXCOLS(mxp);
	if ( ri < n ) {
		rp = rpp[ri];
		for ( i = 0; i < m; i++ ) {
			rp[i] = val;
		}
	}
}


/*
 *  Compute the adjoint (adjugate) matrix of matrix A (indexed by mop->ia):
 *  the transpose of the signed cofactor matrix,
 *
 *        adj(A)[j][i] = (-1)^(i+j) * det( A without row i and column j )
 *
 *  The adjoint matrix can be used to compute the inverse matrix by
 *  definition, A^-1 = adj(A) / det(A).  (But it is not a smart way
 *  because the amount of computation is simply too much.)
 *
 *  What this routine does to mop:
 *    - matrix_op_dup() is called and the matrix at mop->idup afterwards
 *      becomes the adjoint, indexed by mop->iadj;
 *    - mop->idup is then set to the original mop->ia;
 *    - matrix_op_det() is run on A;
 *    - an (n-1) x (n-1) scratch matrix for the cofactors is added to
 *      mop, and mop->ia is left pointing at it on return;
 *    - finally matrix_mul_ab(A, adj) is called and A is printed.
 *      NOTE(review): this presumably overwrites A with A * adj(A),
 *      i.e. det(A) * I, as a self-check -- confirm against
 *      matrix_mul_ab().
 *
 *  Matrices smaller than 2 x 2 are not handled (n-1 is used as the
 *  order of the scratch matrix).
 */
void matrix_op_adj(struct matrix_op_s *mop)
{
    unsigned int i, j, n;
    struct matrix_dsc_s *mxp, *adjmxp, *mxap;
    
    matrix_op_dup(mop);  // make a copy of matrix A
    mop->iadj = mop->idup;
    mop->idup = mop->ia; // Save the matrix A index first, since mop->ia will be redirected to the cofactor scratch matrix.

      mxap = mop->mpp[ mop->ia ];
    adjmxp = mop->mpp[ mop->iadj ];
    
    // compute the determinant of A
    matrix_op_det(mop);

    n = adjmxp->num_rows;

    mxp = matrix_create("Cofactor matrix for computing the adjoint matrix", n-1, n-1);
    mop->ia = matrix_add_mxp(mop, mxp);  // because matrix_op_det() only uses mop->ia.
    
    matrix_print(mxap);

    for ( i = 0; i < n; i++ ) {
        for ( j = 0; j < n; j++ ) {
            matrix_cofactor_ij(mxp, mxap, i, j);
            
            sprintf(dbg, "(i, j) = (%d, %d)", i, j); DBG(dbg); 
               matrix_print(mxp);
               DBG("***********"); 
            matrix_op_det(mop);   // compute the determinant of this cofactor sub-matrix at [i, j]

            // Signed cofactor of (i, j) goes to the transposed position (j, i).
            adjmxp->rowp[j][i] = matrix_det_sign(i, j) * mxp->det;
        }
    }
    
    matrix_print(adjmxp);

    matrix_mul_ab(mxap, adjmxp);
    matrix_print(mxap);
    //--- mxp was added to mop, so it is not released here.
}

/*
 *  Build the cofactor sub-matrix used when computing the adjoint matrix:
 *  ijmxp receives a copy of mxp with row i and column j left out.
 *  ijmxp must already provide storage for the remaining elements.
 */
void matrix_cofactor_ij(struct matrix_dsc_s *ijmxp, struct matrix_dsc_s *mxp, unsigned int i, unsigned int j)
{
    unsigned int rows = mxp->num_rows;
    unsigned int cols = mxp->num_cols;
    unsigned int src_r, src_c, dst_r = 0, dst_c;
    double *src, *dst;

    for ( src_r = 0; src_r < rows; src_r++ ) {
        if ( src_r == i )
            continue;             // the excluded row

        src = mxp->rowp[src_r];
        dst = ijmxp->rowp[dst_r];
        dst_c = 0;
        for ( src_c = 0; src_c < cols; src_c++ ) {
            if ( src_c == j )
                continue;         // the excluded column
            dst[dst_c++] = src[src_c];
        }
        dst_r++;
    }
}

/*
 *  Compute the determinant of matrix A (mop->mpp[mop->ia]) by cofactor
 *  (Laplace) expansion along the first row.
 *  The result is stored in mxp->det.  [ Matrix A ]
 *
 *  Only num_rows is consulted; the matrix is taken to be square.
 *  A 1 x 1 matrix yields its single element and an empty matrix yields
 *  1 (the empty product).  If the index arrays cannot be allocated the
 *  determinant is set to NAN.
 */
void matrix_op_det(struct matrix_op_s *mop)
{
    unsigned int i, n, *rnp, *cnp;
    struct matrix_dsc_s *mxp;

    mxp = mop->mpp[ mop->ia ];  // matrix A
    n = mxp->num_rows;

    // Orders below 2 are settled here; the expansion needs at least 2 x 2.
    if ( n == 0 ) {
        mxp->det = 1.0;
        return;
    }
    if ( n == 1 ) {
        mxp->det = mxp->rowp[0][0];
        return;
    }

    rnp = malloc( n * sizeof *rnp );
    cnp = malloc( n * sizeof *cnp );
    if ( !rnp || !cnp ) {
        free(rnp);   free(cnp);
        DBG("Out of memory: determinant not computed.");
        mxp->det = NAN;
        return;
    }

    // Start with every row and column taking part; matrix_det_rec()
    // expands along the first listed row and strips one row and one
    // column per recursion level.
    for ( i = 0; i < n; i++ ) {
        rnp[i] = i;
        cnp[i] = i;
    }

    mxp->det = matrix_det_rec(mxp, 0, 0, rnp, cnp, n);

    free(rnp);   free(cnp);
}


/*
 *  Recursive worker for the determinant by cofactor expansion.
 *
 *  rnp[0..rank-1] and cnp[0..rank-1] list the rows and columns of mxp
 *  that make up the sub-matrix whose determinant is returned.  The
 *  expansion always runs along the first listed row.  xrn and xcn are
 *  not used; they are kept for the callers' sake.
 *
 *  rank 0 returns 1 (empty product), rank 1 the single element and
 *  rank 2 the closed form.  NAN is returned if the column index array
 *  for the next level cannot be allocated.
 */
double matrix_det_rec(struct matrix_dsc_s *mxp, unsigned int xrn, unsigned int xcn, unsigned int *rnp, unsigned int *cnp, unsigned int rank)
{
    unsigned int i, j, k, *sub_cnp;
    double *r1p, *r2p, *rp, det, elem;

    (void) xrn;
    (void) xcn;

    if ( rank == 0 )
        return 1.0;

    if ( rank == 1 )
        return mxp->rowp[ rnp[0] ][ cnp[0] ];

    if ( rank == 2 ) {
        r1p = mxp->rowp[ rnp[0] ];  // the 1st row 
        r2p = mxp->rowp[ rnp[1] ];  // the 2nd row 

        return r1p[ cnp[0] ] * r2p[ cnp[1] ] - r2p[ cnp[0] ] * r1p[ cnp[1] ];
    }

    sub_cnp = malloc( (rank - 1) * sizeof *sub_cnp );
    if ( !sub_cnp ) {
        DBG("Out of memory: determinant not computed.");
        return NAN;
    }

    /** Expand along the first listed row. **/
    det = 0.0;
    rp = mxp->rowp[ rnp[0] ];
    for ( j = 0; j < rank; j++ ) {
        elem = rp[ cnp[j] ];
        if ( elem == 0.0 ) continue;   // this term contributes nothing

        // Column list of the minor: everything except position j.
        k = 0;
        for ( i = 0; i < rank; i++ ) {
            if ( i != j )
                sub_cnp[k++] = cnp[i];
        }

        // The minor's rows are simply the remaining entries rnp[1..],
        // so the tail of rnp is passed on without copying.
        det += ( matrix_det_sign(0, j) * elem *
            matrix_det_rec(mxp, 0, j, rnp + 1, sub_cnp, rank - 1) );
    }

    free(sub_cnp);

    return det;
}


/*
 *  Compute the determinant of mxp by cofactor (Laplace) expansion along
 *  the first row.  The value is stored in mxp->det and also returned.
 *
 *  Only num_rows is consulted; the matrix is taken to be square.
 *  A 1 x 1 matrix yields its single element and an empty matrix yields
 *  1 (the empty product).  If the index arrays cannot be allocated the
 *  result is NAN.
 */
double matrix_determinant(struct matrix_dsc_s *mxp)
{
    unsigned int i, n, *rnp, *cnp;
    double det;

    n = mxp->num_rows;

    if ( n == 0 ) {
        det = 1.0;
    } else if ( n == 1 ) {
        det = mxp->rowp[0][0];
    } else {
        rnp = malloc( n * sizeof *rnp );
        cnp = malloc( n * sizeof *cnp );

        if ( rnp && cnp ) {
            // Every row and column takes part at the top level;
            // matrix_det_rec() expands along the first listed row.
            for ( i = 0; i < n; i++ ) {
                rnp[i] = i;
                cnp[i] = i;
            }
            det = matrix_det_rec(mxp, 0, 0, rnp, cnp, n);
        } else {
            DBG("Out of memory: determinant not computed.");
            det = NAN;
        }

        free(rnp);   free(cnp);
    }

    mxp->det = det;

    return det;
}

/*
 *  Compute the solution matrix of A x = b by an iterative method. 
 *  mop->jcb_iter = 1 means Jacobi iterative method
 *  mop->jcb_iter = 0 means Gauss-Seidel method
 *
 *  Matrices are taken from mop: A = mpp[ia], b = mpp[ib] and
 *  x = mpp[ix]; only the first column of b and x is used.  x supplies
 *  the starting guess and receives the estimate of every sweep.
 *  A and b are modified in place (each row is divided by its diagonal
 *  element, b additionally negated).  matrix_op_dup() and
 *  matrix_op_aug() are called first.
 *
 *  A fixed number of 8 sweeps is made and x is printed after each one;
 *  there is no convergence test.  The process exits when a zero on the
 *  diagonal of A cannot be repaired by swapping in a later row.
 */
void matrix_op_jcb(struct matrix_op_s *mop)
{
    unsigned int i, j, k, n, m, nx;
    struct matrix_dsc_s *mxap, *mxbp, *mxp;
    double xi, *rp, *nrp, *xp, *bp, aii, sum; 

    sprintf(dbg, "%s", mop->jcb_iter ? "Jacobi iterative":"Guass-Seidel iterative"); DBG(dbg); 
    mxap = mop->mpp[ mop->ia ];  // matrix A
    mxbp = mop->mpp[ mop->ib ];  // matrix b
    mxp  = mop->mpp[ mop->ix ];  // matrix x (the unknowns)

    matrix_op_dup(mop);
    matrix_op_aug(mop);
    
    n = mxap->num_rows;
    m = mxap->num_cols;
    nx = mxp->num_rows;

    //1. Element a[i][i] cannot be zero. Make sure this is the case.
    for ( i = 0; i < n; i++ ) {
        rp = mxap->rowp[i];
        if ( rp[i] == 0.0 ) {
            // j walks over the rows below i, so it is bounded by the
            // number of rows, not by the number of columns.
            for ( j = i+1; j < n; j++ ) {
                nrp = mxap->rowp[j];
                if ( nrp[i] != 0.0 ) {
                    // Swapping two equations moves their right-hand
                    // sides as well, so b has to follow A.
                    matrix_swrow(mxap, i, j);
                    matrix_swrow(mxbp, i, j);
                    break;
                }
            }

            rp = mxap->rowp[i];
            if ( rp[i] == 0.0 ) {
                sprintf(dbg, "Fatal error a[%d][%d] in matrix A is 0.", i, i);
                DBG(dbg); 
                exit(1);
            }
        }
    }

    //2. Normalize the augmented matrix A by a[i][i].
    for ( i = 0; i < n; i++     ) {
        rp = mxap->rowp[i];
        aii = rp[i];
        for ( j = 0; j < m; j++ ) {  // include the augmented column
            rp[j] /= aii;        
        }

        bp = mxbp->rowp[i];
        bp[0] /= -aii;  // negate the b matrix -x = -b+ [A-x]
    }
    
    
    // 3. The iterative method
    for ( k = 0; k < 8; k++ ) {
        for ( i = 0; i < n; i++ ) {
            rp = mxap->rowp[i];
            sum = 0.0;
            // j is a column of A and at the same time a row of x, so it
            // must stay inside both.
            for ( j = 0; j < m && j < nx; j++ ) {
                if ( i != j ) {
                    xp = mxp->rowp[j];
                    xi = xp[0];            
                    sum += rp[j] * xi;
                }
            }
            xp = mxp->rowp[i];
            bp = mxbp->rowp[i];
            if ( mop->jcb_iter ) 
                rp[i] = -(sum + bp[0]);  // Jacobi: park the new x[i] on A's diagonal so this sweep keeps using the old x.
            else
                xp[0] = -(sum + bp[0]);  // Gauss-Seidel: the new x[i] is used right away.

        }
    
        // Transfer the solution to matrix x.
        if ( mop->jcb_iter )
            for ( i = 0; i < n; i++ ) {
                rp = mxap->rowp[i];
                xp = mxp->rowp[i];
                xp[0] = rp[i];
            }

        matrix_print(mxp);
    }
}

/*
 *  Compute the QR decomposition of matrix A (mop->mpp[mop->ia]).
 *  The decomposition will be stored in
 *    Matrix Q   mop->mpp[ mop->iq ]
 *    Matrix R   mop->mpp[ mop->ir ]
 *
 *  The indexes for matrixes Q and R are mop->iq and mop->ir respectively.
 *
 *  A is transposed in place while the routine runs, so that the column
 *  vectors can be handled as rows, and is transposed back before
 *  returning.  Q starts out as the copy made by matrix_op_dup().
 *  NOTE(review): the projection step relies on vector_u_proj_on_v()
 *  leaving the projection in its last argument -- confirm against that
 *  helper.
 */
void matrix_op_qr(struct matrix_op_s *mop)
{
    unsigned int i, j, n, m;
    struct matrix_dsc_s *mxap, *mqp, *mxrp;
    double *qp, *up, *vp, *rp; 


    mxap = mop->mpp[ mop->ia ];  // matrix A
    matrix_xpose(mxap);          // 1. Just for convenience, we need operate by rows
                                 // That's why we transpose here.

    matrix_op_dup(mop);
    mop->iq = mop->idup;
    mqp = mop->mpp[ mop->iq ];    // 2. matrix QR decompose: the Q matrix

    n = mqp->num_rows;
    m = mqp->num_cols;  
    vp = malloc( m * sizeof *vp );
    if ( !vp ) {
        DBG("Out of memory: QR decomposition not computed.");
        matrix_xpose(mxap);       // put A back into its original state
        return;
    }

    for ( i = 1; i < n; i++ ) {   // 3. Generate each orthogonal vector starting with row 2.
        qp = mqp->rowp[i];
        up = mxap->rowp[i];
        
        for ( j = 0; j < i; j++ ) {
            vector_dup(m, vp, mqp->rowp[j]);   // copy the orthogonal vector vp
            vector_u_proj_on_v(m, up, vp);  // proj up onto vp
            vector_u_minus_v(m, qp, vp);    // qp - vp: i.e. removing all the components that were already projected onto the orthogonal basis vectors.
        }
    }

    // 4.  Normalize each row vector in the (transposed) Q matrix.
    for ( i = 0; i < n; i++ ) {     
        qp = mqp->rowp[i];
        vector_normalize(m, qp);
    }

    // 5. Generate the R matrix in QR decomposition.
    // Remember, at this step, matrix A is still in transposed state.
    // This is very convenient for row-based inner vector operation.
    mxrp = matrix_create("Matrix R in QR decompose ", n, m);
    mop->ir = matrix_add_mxp(mop, mxrp);   // register R the same way the other routines do

    for ( i = 0; i < n; i++ ) {
        rp = mxrp->rowp[i];
        qp = mqp->rowp[i];    // column vector from Q matrix
        // j selects a row of the transposed A (n of them) and a column
        // of R (m of them), so it has to stay inside both.
        for ( j = i; j < n && j < m; j++ ) {
            up = mxap->rowp[j];   // column vector from A matrix
            rp[j] = vectors_inner_prod(m, up, qp);
        }
    }

    // 6. Transpose back to get the actual Q matrix.
    matrix_xpose(mxap);  // Transpose A back to its original state.
    matrix_xpose(mqp);   // Transpose Q to its correct state.

    free(vp);
}

/* 
 * Compute the LU decomposition of matrix A (mop->mpp[mop->ia]).
 *  Matrix L is  mop->mpp[ mop->il ]
 *  Matrix U is  mop->mpp[ mop->iu ]
 *
 * Nothing is done unless A is square.  matrix_op_dup() is called first;
 * A itself is then reduced in place to U (its id becomes "LU U matrix"
 * and mop->iu is set to mop->ia).  Every elimination step is written as
 * an elementary matrix E; L is accumulated as the product of the
 * inverses,  L = E1(-1) * E2(-1) * ... * En(-1).
 *
 * No pivoting is done: a zero on the diagonal leads to a division by
 * zero.  Each elementary matrix is printed as a debugging aid.
 * NOTE(review): lud_sub_get_ap_ip() presumably returns row i of A in ap
 * and row i of the temp matrix in ip -- confirm against that helper.
 */

void matrix_op_lud(struct matrix_op_s *mop)
{
    unsigned int i, j, n, m, k;
    struct matrix_dsc_s *mxp, *invmxp, *mxap, *mxlp;
    double mu, *ap, *bp, *ip, *iip;

    mxap = mop->mpp[ mop->ia ];
    n = mxap->num_rows;
    m = mxap->num_cols;
    
    if ( n != m ) return;
    
    matrix_op_dup(mop);  // The A matrix will be holding U matrix.
    mop->iu = mop->ia;
    free(mxap->id);
    mxap->id = strdup("LU U matrix");
    
    mxp = matrix_create("Temp matrix -->", n, n); // Use as an E
    matrix_set_id(mxp);   // Set it to  identity
    invmxp = matrix_create("Temp matrix for computed inverted matrix", n, n);

    // L = E1(-1) * E2(-1) * ... * En(-1)
    mxlp = matrix_create("LU L matrix", n, n);
    matrix_set_id(mxlp);
    matrix_add_mxp(mop, mxlp);
    mop->il = mop->num_matrixes - 1;

    for ( i = 0; i < n; i++ ) {
        lud_sub_get_ap_ip(mxap, mxp, i, &ap, &ip);

        for ( j = i; j < n; j++ ) {
            matrix_set_id(mxp);   // every step starts from a fresh E = I
            bp = mxap->rowp[j];
            iip = mxp->rowp[j];
            
            if ( i == j ) {
                // Scale the pivot row so that its diagonal becomes 1.
                mu = ap[i];
                if ( mu != 1.0 ) {
                    for ( k = i; k < n; k++ ) {
                        ap[k] /= mu;
                    }
                    for ( k = 0; k < n; k++ ) {
                        ip[k] /= mu;
                    }

                    matrix_print(mxp);
                    matrix_compute_inv(mxp, invmxp);  // compute the inverted matrix of mxp
                    matrix_mul_ab(mxlp, invmxp);
                }

            } else {

                // Clear column i of row j with a multiple of the pivot row.
                mu = -bp[i] / ap[i];
                for ( k = i;  k < n; k++ ) {
                    bp[k] += ap[k] * mu;
                }
                for ( k = 0;  k < n; k++ ) {
                    iip[k] += ip[k] * mu;
                }
                matrix_print(mxp);
                matrix_compute_inv(mxp, invmxp);  // compute the inverted matrix of mxp
                matrix_mul_ab(mxlp, invmxp);
            }
        }
    }
    
    // Both temporaries are private to this routine (never added to mop),
    // so both have to be released here.
    matrix_dsc_fini(mxp);
    matrix_dsc_fini(invmxp);
}

/*
 *   Tridiagonal matrix LU decomposition.
 *              A = L U
 *      +-                                         -+
 *      |  a11  a12    0    0.................. 0   |
 *      |  a21  a22   a23     .                 .   |
 *      |   0   a32   a33  a34  .               .   |
 *      |   ..      .     .    .  .             .   |
 * A =  |   .   .     .     .    .  .           .   |
 *      |   .      .    .    .     .  .         .   |
 *      |   .        .    .    .     .  .       .   |
 *      |   .          .    .    .     .  .     .   |
 *      |   .            .    .    .     .  .   .   |
 *      |   .              .    .    .     .  . 0   |
 *      |   .                .    .    .   . an-1,n |
 *      |   0.................0   0   an,n-1  an,n  |
 *      +-                                         -+
 *  
 *      +-                                      -+      +-                                    -+
 *      |  l11   0   ....................... 0   |      |  1   u12    0  ............... 0   |
 *      |  l21  l22  0   .                   .   |      |  0   1    u23   .              .   |
 *      |   0   l23  l33    .                .   |      |  .    .   1        .    a      .   |
 *      |   .    .       .     .             .   |  U=  |  .        .     .     .        .   |    
 * L =  |   .       .       .     .          .   |      |  .            .     .     .    .   |  
 *      |   .           .      .     .       .   |      |  .                .    .       .   |
 *      |   .               .     .     .    .   |      |  .                    .    .       |
 *      |   .                  .      .      0   |      |  .                       .   un-1,n|
 *      |   0 ...................0  ln,n-1 ln,n  |      |  0 ........................0    1  |
 *      +-                                      -+      +-                                  -+
 *
 */
/*
 *  LU decomposition of the tridiagonal matrix A (mop->mpp[mop->ia]).
 *
 *  Two new n x n matrices are created and added to mop: U (indexed by
 *  mop->iu, set to identity and then given its super-diagonal) and
 *  L (indexed by mop->il, given its diagonal and sub-diagonal).  Only
 *  the three diagonals of A are read.  Nothing is done when A is not
 *  square or is empty.  A zero l[i][i] leads to a division by zero.
 */
void matrix_op_dgd(struct matrix_op_s *mop)
{
    unsigned int i, j, n, m, k;
    struct matrix_dsc_s *mxap,  *mxup, *mxlp;
    double *ap, *lp, *up, *last_up, **app, **lpp, **upp;

    mxap = mop->mpp[ mop->ia ];
    n = mxap->num_rows;
    m = mxap->num_cols;
    
    if ( n != m || n == 0 ) return;   // row 0 is dereferenced below
    
    mxup = matrix_create("LU U matrix", n, n);
    mop->iu = mop->num_matrixes;
    matrix_add_mxp(mop, mxup);

    mxlp = matrix_create("LU L matrix", n, n);
    mop->il = mop->num_matrixes;
    matrix_add_mxp(mop, mxlp);

    matrix_set_id(mxup);  // Set U to the identity matrix first

    app = mxap->rowp;    ap = app[0];       
    lpp = mxlp->rowp;    lp = lpp[0];
    upp = mxup->rowp;    up = upp[0];
    
    lp[0] = ap[0];
    if ( n > 1 )          // a 1 x 1 matrix has no element (0, 1)
        up[1] = ap[1] / lp[0];

    for ( i = 1; i < n; i++ ) {
        ap = app[i];    lp = lpp[i];    up = upp[i];
        j = i - 1;      k = i + 1;  last_up = upp[j];
        lp[j] = ap[j];
        lp[i] = ap[i] - lp[j] * last_up[i];
        if ( i != (n-1) )  // The last row won't get update in U matrix.
            up[k] = ap[k] / lp[i];
    }

}

/**
 * Matrix multiplication: A * B = C.  Matrix C must already be created on
 * entry to this routine.  When the dimensions of A, B and C do not fit
 * together a debug message is logged and C is left untouched.
 **/
void matrix_mul_abc(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp,  struct matrix_dsc_s *mxcp)
{
    unsigned int a_rows = MXROWS(mxap);
    unsigned int inner  = MXCOLS(mxap);   // columns of A == rows of B
    unsigned int b_cols;
    unsigned int r, c, t;

    if ( inner != mxbp->num_rows ) { 
        sprintf(dbg, " A # cols (%d) != B # rows(%d); do nothing",
            inner, mxbp->num_rows); DBG(dbg);
        return;
    }

    if ( a_rows != mxcp->num_rows ) {
		MXSIZE(mxap);   MXSIZE(mxbp);   MXSIZE(mxcp);
        sprintf(dbg, " A # rows (%d) != C # rows(%d); do nothing",
            a_rows, mxcp->num_rows); DBG(dbg);
        return;
    } 
    
    if ( mxbp->num_cols != mxcp->num_cols ) {
        sprintf(dbg, " B # cols (%d) != C # cols(%d); do nothing",
            mxbp->num_cols, mxcp->num_cols); DBG(dbg);
        return;
    }

    b_cols = mxbp->num_cols;

    for ( r = 0; r < a_rows; r++ ) {
        double *a_row = mxap->rowp[r];
        double *c_row = mxcp->rowp[r];

        for ( c = 0; c < b_cols; c++ ) {
            double dot = 0.0;

            // Row r of A times column c of B.
            for ( t = 0; t < inner; t++ )
                dot += a_row[t] * mxbp->rowp[t][c];

            c_row[c] = dot;
        }
    }
}

/*
 *  This is not a standard routine to do matrix multiplication.
 *  It is for machine learning only, where the product A * B is stored
 *  in an augmented matrix C: column 0 of every row of C is filled with
 *  the bias '1' and the product goes into columns 1, 2, ...
 *
 *  No dimension checks are made here.
 */
void matrix_mul_ab_augc(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp,  struct matrix_dsc_s *mxcp)
{
    unsigned int a_rows = mxap->num_rows;
    unsigned int inner  = mxap->num_cols;
    unsigned int b_cols = mxbp->num_cols;
    unsigned int r, c, t;

    for ( r = 0; r < a_rows; r++ ) {
        double *a_row = mxap->rowp[r];
        double *c_row = mxcp->rowp[r];

        c_row[0] = 1;                       // the bias column
        for ( c = 0; c < b_cols; c++ ) {    // walk the columns of B
            double dot = 0.0;

            for ( t = 0; t < inner; t++ )   // t: column of A, row of B
                dot += a_row[t] * mxbp->rowp[t][c];

            c_row[c + 1] = dot;             // shifted right by the bias column
        }
    }
}

/*
 *
 *  This is not a standard routine to do matrix multiplication. 
 *  It is intended for use in the special cases of machine learning, 
 *  which often requires the *transposed matrix* multiplication. To
 *  avoid the "cost" of matrix transpose.  This routine was written.
 *  This does the transposed matrix multiplication of 
 *
 *      A.transpose() * B = C
 *
 *  without physically transposing matrix A.  All the storage for
 *  matrices A, B and C must already be created upon entry to this routine.
 *   COL#(A)  ==  ROW#(C);  COL#(B)  ==  COL#(C)
 *   ROW#(A)  ==  ROW#(B)
 *                                                  By matrix-matrix dot-product,
 *     A.T()     *    B       =         C           
 *    (3 x 2).T()   (3 x 2)          (2 X 2)        Acol1 * Bcol1 = CR11    Acol1 *Bcol2  = CR12
 *   +--------+    +--------+      +----------+                     (89)                    (98)
 *   | 1   2  |    | 7   8  |      | 89    98 |   
 *   +--------+    +--------+      +----------+     Acol2 * Bcol1 = CR21    Acol2 *Bcol2  = CR22
 *   | 3   4  |    | 9   10 |      | 116  128 |                    (116)                   (128)
 *   +--------+    +--------+      +----------+
 *   | 5   6  |    | 11  12 |
 *   +--------+    +--------+
 *
 */
void matrix_mul_aTbc(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp,  struct matrix_dsc_s *mxcp)
{
    unsigned int a_cols, shared_rows, b_cols;
    unsigned int ca, cb, r, mc;

    // A and B must have the same number of rows; C is COL#(A) x COL#(B).
    mc = MXROWS_EQ(mxap, mxbp) && EQ(MXCOLS(mxap), MXROWS(mxcp)) && MXCOLS_EQ(mxbp, mxcp);
    if ( !mc ) {
        DBG("Error: for C = A.T * B  to operation, COL#(A)  ==  ROW#(C); ROW#(A)  == ROW#(B) and ROW#(C)  ==  COL#(B) ");
        MXSIZE(mxap);  MXSIZE(mxbp);  MXSIZE(mxcp);
        return;
    }

    a_cols      = MXCOLS(mxap);
    shared_rows = MXROWS(mxap);
    b_cols      = MXCOLS(mxbp);

    for ( ca = 0; ca < a_cols; ca++ ) {        // column of A == row of C
        double *c_row = mxcp->rowp[ca];

        for ( cb = 0; cb < b_cols; cb++ ) {    // column of B == column of C
            double dot = 0.0;

            // Column ca of A times column cb of B, walking down the rows.
            for ( r = 0; r < shared_rows; r++ )
                dot += mxap->rowp[r][ca] * mxbp->rowp[r][cb];

            c_row[cb] = dot;
        }
    }
}

/*
 *
 *  This is not a standard routine to do matrix multiplication. 
 *  It is intended for use in the special cases of machine learning, 
 *  where matrix A carries a leading bias column (the augmented '1's)
 *  that must not take part in the product.  This routine multiplies
 *  the de-augmented A (A without its first column) by B:
 *
 *      A.deaug() * B = C
 *
 *  without physically removing the column from A.  All the storage for
 *  matrices A, B and C must already be created upon entry to this routine.
 *   ROW#(A)  ==  ROW#(C);  COL#(B)  ==  COL#(C)
 *   COL#(A) - 1  ==  ROW#(B)
 *                                                 By matrix-matrix dot-product,
 *     A.deaug()  *    B           =     C           
 *    (3 x 3(2))        (2 x 3)          (3 X 3)   
 *   +-----------+   +----------+    +------------+   
 *   |1 | 1   2  |   | 7  8   9 |    |27   30  33 |   
 *   +-----------+   +----------+    +------------+  
 *   |1 | 3   4  |   | 10 11 12 |    |61   68  75 |      
 *   +-----------+   +----------+    +------------+
 *   |1 | 5   6  |                   |95  106 117 |
 *   +-----------+                   +------------+
 *
 *    By matrix-matrix dot product: 
 *
 *    Arow1 * Bcol1 = CR11    Arow1 *Bcol2 = CR12   Arow1 *Bcol3 = CR13  
 *                    (27)                    (30)                 (33)
 *    Arow2 * Bcol1 = CR21    Arow2 *Bcol2 = CR22   Arow2 *Bcol3 = CR23
 *                    (61)                    (68)                 (75)
 *    Arow3 * Bcol1 = CR31    Arow3 *Bcol2 = CR32   Arow3 *Bcol3 = CR33  
 *                    (95)                   (106)                (117)
 *
 */
void matrix_mul_adeaug_bc(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp,  struct matrix_dsc_s *mxcp)
{
    unsigned int i, n, m, k, ai, bi, mc;
    double *ap, *bp, *cp, accum;

    // Computes C = A.deaug() * B, where A.deaug() is A without its first
    // column.  The inner dimension is therefore COL#(A) - 1, which has to
    // match ROW#(B); C has the rows of A and the columns of B.
    // NOTE(review): MXROWS_EQ_COLS(mxap, mxbp) presumably also demands
    // ROW#(A) == COL#(B), which the product itself does not need --
    // confirm whether that restriction is intended.
    mc = MXROWS_EQ_COLS(mxap, mxbp) && MXROWS_EQ(mxap, mxcp) && MXCOLS_EQ(mxbp, mxcp)
                                                && EQ( MXCOLS(mxap)-1, MXROWS(mxbp) );
    if ( !mc ) {
        DBG("Error: for C = A.deaug() * B  to operation, COL#(A)  ==  ROW#(C); ROW#(A)  == ROW#(B) and ROW#(C)  ==  COL#(B) ");
        MXSIZE(mxap);  MXSIZE(mxbp);  MXSIZE(mxcp);
        return;
    }

    m = MXROWS(mxap);
    n = MXCOLS(mxbp);
    k = MXCOLS(mxap)-1;   // inner dimension: columns of A minus the bias column (== ROW#(B))
    for (ai = 0; ai < m; ai++){ // Move along the rows of A and rows of C
        ap = mxap->rowp[ai];  
        cp = mxcp->rowp[ai];
        for (bi = 0; bi < n; bi++) {  // Move along the cols of B
            accum = 0.0;
            for ( i = 0; i < k; i++ ) {
                bp = mxbp->rowp[i];   // Get the row pointer of B
                accum += ap[i+1] * bp[bi]; // dot product, skipping column 0 of A
            }
            cp[bi] = accum;  // Set C's index by bi.
        }
    }

}

/*
 *
 *  This is not a standard routine to do matrix multiplication. 
 *  It is intended for use in the special cases of machine learning, 
 *  which often requires the *transposed matrix* multiplication. To
 *  avoid the "cost" of matrix transpose.  This routine was written.
 *  This does the transposed matrix multiplication of 
 *
 *      A.transpose() * B = C
 *
 *  without physically transposing matrix A; the leading (bias) column
 *  of A is skipped.  All the storage for matrices A, B and C must
 *  already be created upon entry to this routine.
 *   COL#(A) - 1  ==  ROW#(C);  COL#(B)  ==  COL#(C)
 *   ROW#(A)  ==  ROW#(B)
 *                                                  By matrix-matrix dot-product,
 *     A.T()     *    B       =         C           
 *    (3 x 3).T()   (3 x 2)          (2 X 2)        Acol1 * Bcol1 = CR11    Acol1 *Bcol2  = CR12
 *   +-----------+    +--------+      +----------+                     (89)                    (98)
 *   |1 | 1   2  |    | 7   8  |      | 89    98 |   
 *   +-----------+    +--------+      +----------+     Acol2 * Bcol1 = CR21    Acol2 *Bcol2  = CR22
 *   |1 | 3   4  |    | 9   10 |      | 116  128 |                    (116)                   (128)
 *   +-----------+    +--------+      +----------+
 *   |1 | 5   6  |    | 11  12 |
 *   +-----------+    +--------+
 *
 */
void matrix_mul_adeaugT_bc(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp,  struct matrix_dsc_s *mxcp)
{
    /*
     * C = A.deaug().T * B without building the transpose: column 0 of A
     * is skipped, so row r of C comes from column r+1 of A.
     * C must already be allocated.  On a dimension mismatch the error is
     * logged, MXSIZE is invoked on each operand and C is left untouched.
     */
    unsigned int a_rows, a_cols, b_cols, out_row, out_col, r, dims_ok;
    double *c_row, sum;

    dims_ok = MXROWS_EQ(mxap, mxbp) && EQ(MXCOLS(mxap)-1, MXROWS(mxcp)) && MXCOLS_EQ(mxbp, mxcp);
    if ( !dims_ok ) {
        DBG("Error: for C = A.T * B  to operation, COL#(A)  ==  ROW#(C); ROW#(A)  == ROW#(B) and ROW#(C)  ==  COL#(B) ");
        MXSIZE(mxap);  MXSIZE(mxbp);  MXSIZE(mxcp);
        return;
    }

    a_cols = MXCOLS(mxap);
    a_rows = MXROWS(mxap);
    b_cols = MXCOLS(mxbp);

    /* One output row per non-bias column of A. */
    for ( out_row = 0; out_row < a_cols - 1; out_row++ ) {
        c_row = mxcp->rowp[out_row];
        for ( out_col = 0; out_col < b_cols; out_col++ ) {
            /* Dot product of A's column (out_row+1) with B's column out_col. */
            sum = 0.0;
            for ( r = 0; r < a_rows; r++ ) {
                sum += mxap->rowp[r][out_row + 1] * mxbp->rowp[r][out_col];
            }
            c_row[out_col] = sum;
        }
    }
}


/*
 *
 *  This is not a standard routine to do matrix multiplication.  It
 *  is intended for use in the special cases of machine learning, 
 *  which often requires the *transposed matrix* multiplication. To
 *  avoid the "cost" of matrix transpose.  This routine was written.
 *  This does the transposed matrix multiplication of 
 *
 *      A * B.transpose() = C
 *
 *  without physically transposing matrix B.  All the storage for
 *  matrices A, B and C is already created upon entry of this routine.
 *   ROW#(A)  ==  ROW#(C);  ROW#(B)  ==  COL#(C)
 *   COL#(A)  ==  COL#(B);
 *
 *    ( 2 x 4)        *      (3 X 4).T()     =       2 X 3 
 *   +-------------+    +---------------+       +----------------+
 *   | 1   2  3  4 |    |11  12  13  14 |       | 130  170  210  |   By matrix-matrix row dot-product
 *   +-------------+    +---------------+       +----------------+ 
 *   | 5   6  7  8 |    |15  16  17  18 |       | 330  434  538  |
 *   +-------------+    +---------------+       +----------------+
 *                      |19  20  21  22 |       AR1*BR1 = CR11  AR1*BR2 = CR12  AR1 * BR3 = CR13 
 *                      +---------------+                 (130)          (170)             (210) 
 *                                              AR2*BR1 = CR21  AR2*BR2 = CR22  AR2 * BR3 = CR23
 *                                                        (330)          (434)             (538)
 */ 
void matrix_mul_abTc(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp,  struct matrix_dsc_s *mxcp)
{
    /*
     * C = A * B.T without building the transpose: element (r, c) of C is
     * the dot product of row r of A with row c of B.
     * C must already be allocated.  On a dimension mismatch the error is
     * logged, MXSIZE is invoked on each operand and C is left untouched.
     */
    unsigned int a_rows, width, b_rows, r, c, t, dims_ok;
    double *a_row, *b_row, *c_row, sum;

    dims_ok = MXROWS_EQ(mxap, mxcp) && MXCOLS_EQ(mxap, mxbp) && MXROWS_EQ_COLS(mxbp, mxcp);
    if ( !dims_ok ) {
        DBG("Error: for C = A * B.T  to operation, ROW#(A)  ==  ROW#(C); COL#(A)  ==  COL#(B)");
        MXSIZE(mxap);  MXSIZE(mxbp);  MXSIZE(mxcp);
        return;
    }

    a_rows = MXROWS(mxap);  /* rows of A (and of C)                 */
    width  = MXCOLS(mxap);  /* columns of A, the shared inner length */
    b_rows = MXROWS(mxbp);  /* rows of B (columns of C)             */

    for ( r = 0; r < a_rows; r++ ) {
        a_row = mxap->rowp[r];
        c_row = mxcp->rowp[r];
        for ( c = 0; c < b_rows; c++ ) {
            b_row = mxbp->rowp[c];
            sum = 0.0;
            for ( t = 0; t < width; t++ ) {
                sum += a_row[t] * b_row[t];
            }
            c_row[c] = sum;
        }
    }
}

/*
 *
 *  This is not a standard routine to do matrix multiplication.  It
 *  is intended for use in the special cases of machine learning, 
 *  which often requires the *transposed matrix* multiplication. To
 *  avoid the "cost" of matrix transpose.  This routine was written.
 *  This does the transposed matrix multiplication of 
 *
 *      A * B.transpose() = C.augment()
 *
 *  without physically transposing matrix B.  All the storage for
 *  matrices A, B and C is already created upon entry of this routine.
 *  The biased column, (0-th column), is set to 1 during the multiplication.
 *   ROW#(A)  ==  ROW#(C);  ROW#(B)  ==  COL#(C)
 *   COL#(A)  ==  COL#(B);
 *
 *    ( 2 x 4)       *      (3 X 4).T()   =       2 X 4(3) 
 *   +-------------+   +---------------+    +----------------+
 *   | 1   2  3  4 |   |11  12  13  14 |    |1  130  170  210|   By matrix-matrix row dot-product
 *   +-------------+   +---------------+    +----------------+ 
 *   | 5   6  7  8 |   |15  16  17  18 |    |1  330  434  538|
 *   +-------------+   +---------------+    +----------------+
 *                     |19  20  21  22 |        AR1*BR1 = CR12  AR1*BR2 = CR13  AR1 * BR3 = CR14 
 *                     +---------------+                 (130)          (170)             (210) 
 *                                              AR2*BR1 = CR22  AR2*BR2 = CR23  AR2 * BR3 = CR24
 *                                                        (330)          (434)             (538)
 */ 
void matrix_mul_abT_augc(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp,  
                                                                      struct matrix_dsc_s *mxcp)
{
    /*
     * C.augment() = A * B.T: column 0 of every row of C is set to 1.0 and
     * column c+1 receives the dot product of the A row with row c of B.
     * C must already be allocated with ROW#(B)+1 columns.  On a dimension
     * mismatch the error is logged, MXSIZE is invoked on each operand and
     * C is left untouched.
     */
    unsigned int a_rows, width, b_rows, r, c, t, dims_ok;
    double *a_row, *b_row, *c_row, sum;

    dims_ok = MXROWS_EQ(mxap, mxcp) && MXCOLS_EQ(mxap, mxbp) && EQ(MXROWS(mxbp)+1, MXCOLS(mxcp));
    if ( !dims_ok ) {
        DBG("Error: for C = A * B.T  to operation, ROW#(A)  ==  ROW#(C); COL#(A)  ==  COL#(B)");
        MXSIZE(mxap);  MXSIZE(mxbp);  MXSIZE(mxcp);
        return;
    }

    a_rows = MXROWS(mxap);
    width  = MXCOLS(mxap);
    b_rows = MXROWS(mxbp);

    for ( r = 0; r < a_rows; r++ ) {
        a_row = mxap->rowp[r];
        c_row = mxcp->rowp[r];
        c_row[0] = 1.0;                 /* bias column */
        for ( c = 0; c < b_rows; c++ ) {
            b_row = mxbp->rowp[c];
            sum = 0.0;
            for ( t = 0; t < width; t++ ) {
                sum += a_row[t] * b_row[t];
            }
            c_row[c + 1] = sum;         /* shifted right past the bias column */
        }
    }
}

/*
 *  This is not a standard routine to do matrix multiplication.
 *  This is for machine learning only, where the right-hand operand B
 *  is an augmented matrix whose first column is filled with a bias
 *  of '1.'; that column is skipped in the multiplication.
 *
 *      C = A.transpose() * B.deaug()
 *
 *        5 X 4              5 X 2         4  X  2
 *    +------------+    +----------+   +-------------+
 *    | 1  2  3  4 |    |1  21  22 |   | 1205   1250 |
 *    +------------+    +----------+   +-------------+
 *    | 5  6  7  8 |    |1  23  24 |   | 1330   1380 |
 *    +------------+    +----------+   +-------------+
 *    | 9 10 11 12 |    |1  25  26 |   | 1455   1510 |
 *    +------------+    +----------+   +-------------+
 *    |13 14 15 16 |    |1  27  28 |   | 1580   1640 |
 *    +------------+    +----------+   +-------------+
 *    |17 18 19 20 |    |1  29  30 |                
 *    +------------+    +----------+                    
 *                       ^
 *                       |
 *                       +-- Note the bias column 0, which contains
 *                           all 1's, is skipped in the multiplication.
 *
 */
void matrix_mul_aT_deaugb_c(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp,  struct matrix_dsc_s *mxcp)
{
    /*
     * C = A.T * B.deaug(): element (ac, bc-1) of C is the dot product of
     * column ac of A with column bc of B, for bc = 1 .. COL#(B)-1.
     * Column 0 of B is never read.  No dimension validation is performed
     * here; C must already be allocated by the caller.
     */
    unsigned int a_rows, a_cols, b_cols, ac, bc, r;
    double *c_row, sum;

    a_rows = MXROWS(mxap);   /* length of every column dot product */
    a_cols = MXCOLS(mxap);   /* rows of the result                 */
    b_cols = MXCOLS(mxbp);

    for ( ac = 0; ac < a_cols; ac++ ) {
        c_row = mxcp->rowp[ac];
        for ( bc = 1; bc < b_cols; bc++ ) {     /* start at 1: skip the bias column of B */
            sum = 0.0;
            for ( r = 0; r < a_rows; r++ ) {
                sum += mxap->rowp[r][ac] * mxbp->rowp[r][bc];
            }
            c_row[bc - 1] = sum;
        }
    }
}

/*
 * Check the operand dimensions for C = A * B:
 *
 *     ROW#(A) == ROW#(C),  COL#(B) == COL#(C),  COL#(A) == ROW#(B)
 *
 * Returns 1 when all three relations hold and 0 otherwise.  On failure a
 * general error is logged, followed by one message (preceded by MXSIZE
 * on the two matrices involved) for every relation that is violated.
 */
unsigned int mxmul_check_abc(struct matrix_dsc_s *amxp, struct matrix_dsc_s *bmxp, struct matrix_dsc_s *cmxp)
{
	unsigned int rows_ok, cols_ok, inner_ok, ok;

	rows_ok  = MXROWS_EQ(amxp, cmxp); 
	cols_ok  = MXCOLS_EQ(bmxp, cmxp);
	inner_ok = MXCOLS_EQ_ROWS(amxp, bmxp); 
	ok = rows_ok && cols_ok && inner_ok;

	if ( ok )
		return ok;

	DBG("Error: A * B = C, dimensions do not match."); 

	if ( !rows_ok ) {
		MXSIZE(amxp);
		MXSIZE(cmxp);
		sprintf(dbg, "The # of rows of %s (A) and %s (C) are not equal.",
			MXID(amxp), MXID(cmxp) );
		DBG(dbg); 
	}

	if ( !cols_ok ) {
		MXSIZE(bmxp);
		MXSIZE(cmxp);
		sprintf(dbg, "The # of columns of %s (B) and %s (C) are not equal.",
			MXID(bmxp), MXID(cmxp) );
		DBG(dbg); 
	}

	if ( !inner_ok ) {
		MXSIZE(amxp);
		MXSIZE(bmxp);
		sprintf(dbg, "The # of columns of %s (A) and the # of rows of %s (B) are not equal.",
			MXID(amxp), MXID(bmxp) );
		DBG(dbg); 
	}

	return ok;
}

/*
 * Check the operand dimensions for C = A.T() * B:
 *
 *     ROW#(A) == ROW#(B),  COL#(A) == ROW#(C),  COL#(B) == COL#(C)
 *
 * Returns 1 when all three relations hold and 0 otherwise.  On failure a
 * general error is logged, followed by one message (preceded by MXSIZE
 * on the two matrices involved) for every relation that is violated.
 */
unsigned int mxmul_check_aTbc(struct matrix_dsc_s *amxp, struct matrix_dsc_s *bmxp, struct matrix_dsc_s *cmxp)
{
	unsigned int inner_ok, rows_ok, cols_ok, ok;

	inner_ok = MXROWS_EQ(amxp, bmxp); 
	rows_ok  = MXCOLS_EQ_ROWS(amxp, cmxp);
	cols_ok  = MXCOLS_EQ(bmxp, cmxp); 
	ok = inner_ok && rows_ok && cols_ok;

	if ( ok )
		return ok;

	DBG("Error: A.T() * B = C, dimensions do not match."); 

	if ( !inner_ok ) {
		MXSIZE(amxp);
		MXSIZE(bmxp);
		sprintf(dbg, "The # of rows of %s (A.T()) and %s (B) are not equal.",
			MXID(amxp), MXID(bmxp) );
		DBG(dbg); 
	}

	if ( !rows_ok ) {
		MXSIZE(amxp);
		MXSIZE(cmxp);
		sprintf(dbg, "The #  of columns of %s (A) and the # of rows of %s (C) are not equal.",
			MXID(amxp), MXID(cmxp) );
		DBG(dbg); 
	}

	if ( !cols_ok ) {
		MXSIZE(bmxp);
		MXSIZE(cmxp);
		sprintf(dbg, "The # of columns of %s (B) and %s (C) are not equal.",
			MXID(bmxp), MXID(cmxp) );
		DBG(dbg); 
	}

	return ok;
}

/*
 * Check the operand dimensions for C = A * B.T():
 *
 *     COL#(A) == COL#(B),  ROW#(A) == ROW#(C),  ROW#(B) == COL#(C)
 *
 * Returns 1 when all three relations hold and 0 otherwise.  On failure a
 * general error is logged, followed by one message (preceded by MXSIZE
 * on the two matrices involved) for every relation that is violated.
 */
unsigned int mxmul_check_abTc(struct matrix_dsc_s *amxp, struct matrix_dsc_s *bmxp, struct matrix_dsc_s *cmxp)
{
	unsigned int inner_ok, rows_ok, cols_ok, ok;

	inner_ok = MXCOLS_EQ(amxp, bmxp);
	rows_ok  = MXROWS_EQ(amxp, cmxp);
	cols_ok  = MXROWS_EQ_COLS(bmxp, cmxp); 
	ok = inner_ok && rows_ok && cols_ok;

	if ( ok )
		return ok;

	DBG("Error: A * B.T() = C, dimensions do not match."); 

	if ( !inner_ok ) {
		MXSIZE(amxp);
		MXSIZE(bmxp);
		sprintf(dbg, "The # of columns of %s (A) and %s (B.T()) are not equal.",
			MXID(amxp), MXID(bmxp) );
		DBG(dbg); 
	}

	if ( !rows_ok ) {
		MXSIZE(amxp);
		MXSIZE(cmxp);
		sprintf(dbg, "The # of rows of %s (A) and %s (C) are not equal.",
			MXID(amxp), MXID(cmxp) );
		DBG(dbg); 
	}

	if ( !cols_ok ) {
		MXSIZE(bmxp);
		MXSIZE(cmxp);
		sprintf(dbg, "The # of rows of %s (B.T()) and the # of columns of %s(C) are not equal.",
			MXID(bmxp), MXID(cmxp) );
		DBG(dbg); 
	}

	return ok;
}

 
/*
 * Report whether matrices A and B have the same size, as decided by
 * MXSIZE_EQ.  The MXSIZE_EQ result is returned; when it is zero, the
 * ids and the (rows, cols) of both matrices are logged.
 */
unsigned int nn_matrix_eq(struct matrix_dsc_s *amxp, struct matrix_dsc_s *bmxp)
{
	unsigned int same;

	same = MXSIZE_EQ(amxp, bmxp);
	if ( same )
		return same;

	sprintf(dbg, "Error: the dimensions of : %s(A), %s(B) are not equal...", 
		MXID(amxp), MXID(bmxp) );
	DBG(dbg); 

	sprintf(dbg, "%s (A) (%5d, %5d) and %s(B) (%5d, %5d)",
		MXID(amxp), MXROWS(amxp), MXCOLS(amxp),
		MXID(bmxp), MXROWS(bmxp), MXCOLS(bmxp) );
	DBG(dbg); 

	return same;
}

/*
 * Report whether matrices A, B and C all have the same size, as decided
 * by pairwise MXSIZE_EQ checks (A/B, B/C, C/A).
 *
 * Returns 1 when every pair matches and 0 otherwise.  On failure a
 * general error is logged, followed by the ids and (rows, cols) of each
 * pair that does not match.
 */
unsigned int nn_matrix_eq3(struct matrix_dsc_s *amxp, struct matrix_dsc_s *bmxp, struct matrix_dsc_s *cmxp)
{
	unsigned int y, a, b, c;

	a = MXSIZE_EQ(amxp, bmxp);
	b = MXSIZE_EQ(bmxp, cmxp);
	c = MXSIZE_EQ(cmxp, amxp);
	
	y =  a && b && c;
	
	if ( !y ) {
		sprintf(dbg, "Error: the dimensions of : %s(A), %s(B) and %s(C) are not equal...", 
			MXID(amxp), MXID(bmxp), MXID(cmxp) ); DBG(dbg); 

		if ( !a )  {
			sprintf(dbg, "%s (A) (%5d, %5d) and %s(B) (%5d, %5d)",
				MXID(amxp), MXROWS(amxp), MXCOLS(amxp),
				MXID(bmxp), MXROWS(bmxp), MXCOLS(bmxp) ); DBG(dbg); 
		}
	
		if ( !b )  {
			/* B-vs-C mismatch: report B (the original printed A here). */
			sprintf(dbg, "%s (B) (%5d, %5d) and %s(C) (%5d, %5d)",
				MXID(bmxp), MXROWS(bmxp), MXCOLS(bmxp),
				MXID(cmxp), MXROWS(cmxp), MXCOLS(cmxp) ); DBG(dbg); 
		}
		
		if ( !c )  {
			sprintf(dbg, "%s (A) (%5d, %5d) and %s(C) (%5d, %5d)",
				MXID(amxp), MXROWS(amxp), MXCOLS(amxp),
				MXID(cmxp), MXROWS(cmxp), MXCOLS(cmxp) ); DBG(dbg); 
		}
	}

	return y;

}

/*
 * Look up a matrix in `mop` by its id string.
 * Returns the first descriptor whose id compares equal (strcmp) to `id`,
 * or NULL when none of the get_num_matrixes(mop) entries matches.
 */
struct matrix_dsc_s *mop2mxp_by_str(struct matrix_op_s *mop, unsigned char *id)
{
	unsigned int slot, count;
	struct matrix_dsc_s *cand;

	count = get_num_matrixes(mop);

	for ( slot = 0; slot < count; slot++ ) {
		cand = MOP2MXP(mop, slot);
		if ( strcmp(MXID(cand), id) == 0 )
			return cand;
	}

	return NULL;
}

/*
 * Look up the slot index of a matrix in `mop` by its id string.
 * Returns the index of the first descriptor whose id compares equal
 * (strcmp) to `id`, or MX_NONIDX when no entry matches.
 */
unsigned int mop2mxid_by_str(struct matrix_op_s *mop, unsigned char *id)
{
	unsigned int slot, count;

	count = get_num_matrixes(mop);

	for ( slot = 0; slot < count; slot++ ) {
		if ( strcmp(MXID(MOP2MXP(mop, slot)), id) == 0 )
			return slot;
	}

	return MX_NONIDX;
}

/*
 *  This is not a standard routine to do matrix multiplication.
 *  This is for machine learning only. It computes the scalar
 *  product of the following formula:
 *
 *      scalar product = v.T * A * u
 *
 */
double  matrix_mul_vT_A_u(double *vp, struct matrix_dsc_s *mxp, double *up)
{
	/*
	 * vp is indexed by the rows of A and up by the columns of A.
	 * The row vector v.T * A is built in a zero-initialised scratch
	 * buffer (CALLOC) that is released before returning; the result is
	 * its inner product with u.
	 */
	unsigned int ncols, nrows, col, row;
	double **rows, *vTA, result;

	ncols = MXCOLS(mxp);
	nrows = MXROWS(mxp);
	vTA = CALLOC(ncols, double);
	rows = MXRPP(mxp);

	for ( col = 0; col < ncols; col++ ) {
		for ( row = 0; row < nrows; row++ ) {
			vTA[col] += rows[row][col] * vp[row];
		}
	}

	result = vectors_inner_prod(ncols, vTA, up);
	free(vTA);

	return result;
}

/*
 *  This is the same routine as matrix_mul_vT_A_u(); it computes
 *  the scalar product of the following formula:
 *
 *      scalar product = v.T * A * u
 *
 *  except that the storage for storing the intermediate vector
 *  that is generated during the calculation was already allocated
 *  upon entry of this routine.  The purpose is to avoid allocating
 *  and freeing dynamic memory repeatedly when this routine is
 *  frequently called.
 *
 */
double  matrix_mul_vT_A_u2(double *vp, struct matrix_dsc_s *mxp, double *up, double *tp)
{
	/*
	 * vp : vector indexed by the rows of A
	 * up : vector indexed by the columns of A
	 * tp : caller-owned scratch buffer with at least COL#(A) entries; on
	 *      return it holds the row vector v.T * A
	 *
	 * Every tp[i] is recomputed from zero, so the buffer may be reused
	 * across calls without being cleared by the caller.  (The original
	 * accumulated with += into whatever tp already contained.)
	 */
	unsigned int m, n, i, j;
	double **rpp, acc;
	
	m = MXCOLS(mxp);
	n = MXROWS(mxp);
	rpp = MXRPP(mxp);
	for ( i = 0; i < m; i++ ) {
		acc = 0.0;
		for ( j = 0; j < n; j++ ) {
			acc += rpp[j][i] * vp[j];
		}
		tp[i] = acc;
	}
	
	return vectors_inner_prod(m, tp, up);
}

/*
 *  This is not a standard routine to do matrix multiplication.
 *  This is for machine learning only. It computes the following 
 *  formula:
 *
 *      H  =  A + c *( v * u.T)
 *  
 *  where H and A are n x m matrices and v and u are column vectors
 *  and c is a scalar constant.
 *
 */
void matrix_add_A_c_vuT(struct matrix_dsc_s *Hmxp, struct matrix_dsc_s *Amxp, double c, double *vp, double *up)
{
	/*
	 * Hmxp : destination, H[i][j] = A[i][j] + c * v[i] * u[j]
	 * Amxp : source matrix; its dimensions drive both loops
	 * vp   : column vector, one entry per row of A
	 * up   : vector, one entry per column of A
	 *
	 * H must already be allocated with at least the dimensions of A.
	 * The outer loop runs over rows and the inner loop over columns;
	 * the original had the two bounds swapped, which is only correct
	 * for square matrices.
	 */
	unsigned int rows, cols, i, j;
	double **arpp, **hrpp, *arp, *hrp, v;

	rows = MXROWS(Amxp);
	cols = MXCOLS(Amxp);
	arpp = MXRPP(Amxp);
	hrpp = MXRPP(Hmxp);
	
	for ( i = 0; i < rows; i++ ) {
		arp = arpp[i];
		hrp = hrpp[i];
		v = vp[i];	
		for ( j = 0; j < cols; j++ ) {
			hrp[j] = arp[j] + c * v * up[j];
		}
	}
}

/*
 *  This is not a standard routine to do matrix multiplication.
 *  This is for machine learning only. It computes the following 
 *  formula:
 *
 *      H1 = H1 + c * A * v * uT * B
 *  
 *  H1, A and B are matrices, v and u are vectors, c is a scalar
 *  constant.  In the implementation, A *v and uT * B are computed
 *  to form a column vector and a row vector, respectively. The
 *  intermediate column and row vectors are used to form a corresponding
 *  matrix via outer product and update the H1 directly element by 
 *  element.  In this way no matrix storage would be allocated. 
 *  
 */
void matrix_H1_sub_c_AvuT_B(struct matrix_dsc_s *H1mxp, double c,
 struct matrix_dsc_s *Amxp, double *vp, double *up, struct matrix_dsc_s *Bmxp)
{
	unsigned int i, j, m, n;
	double *colvecp, *rowvecp, **h1rpp, **arpp, **brpp, *h1rp, *arp, *brp;

	h1rpp = MXRPP(H1mxp);  m = MXROWS(H1mxp);
	 arpp = MXRPP(Amxp);   n = MXCOLS(H1mxp);
     brpp = MXRPP(Bmxp);  
	
	 colvecp = MALLOC(m, double);
	 rowvecp = MALLOC(m, double);
	 // 1) compute the column vector = A * v
	 for ( i = 0; i < m; i++ ) {
	 	 arp = arpp[i];
		 colvecp[i] = vectors_inner_prod(n, arp, vp);
	 }

	 
	 // 2) compute the row vector = uT * B
	for ( i = 0; i < m; i++ ) {
		rowvecp[i] = 0.0;
		for ( j = 0; j < n; j++ ) {
			brp = brpp[j];
			rowvecp[i] += brp[i] * up[j];
		}
	}
	
	//vector_print2(3, colvecp);
	//vector_print2(3, rowvecp);
	
	// 3) Now use the outer product of the column and row vectors
	// to update H1.  Note the matrix of the outer product 
	// is never explicitly formed so no storage is allocated.
	//
	for ( i = 0; i < m; i++ ) {
		h1rp = h1rpp[i];
		for ( j = 0; j < n; j++ ) {
			h1rp[j] += c * colvecp[i] * rowvecp[j];
		}
	}
	
	free(colvecp);
	free(rowvecp);
}


/*
 *  This routine implements the DFP algorithm which is used to update
 *  the H1 matrix in quasi-Newton's method.
 *
 *  Output:  updated H1 matrix
 *  Input:   H matrix, the approximation to the inverse of Hessian matrix
 *           delta vector,  delta = xk1 - xk
 *           gamma vector,  gamma = gk1 - gk
 *
 *  The DFP algorithm
 *  ==================
 *
 *               delta * deltaT         H * gamma * gammaT * H
 *  H1 = H  +  ------------------  -  ---------------------------
 *               deltaT * gamma           gammaT * H * gamma
 *
 *  NB: Both the column vector and row vector are represented by
 *  the (double *) type.  The interpretation of whether it is
 *  a column vector or a row vector is subject to the context and the 
 *  program implementation.
 *
 */

void nlnopt_dfp(struct matrix_dsc_s *H1mxp, struct matrix_dsc_s *Hmxp, double *deltap, double *gammap)
{
	/*
	 * Both matrices are processed as dim x dim, where dim is the row
	 * count of H1.  Two scratch vectors of dim entries are allocated
	 * with MALLOC and freed before returning.  Neither denominator is
	 * checked against zero.
	 */
	unsigned int dim, r, c;
	double **newH, **oldH, *newrow, *oldrow,
	   delta_dot_gamma, gamma_H_gamma,
	   *Hg, *gH, d_r, Hg_r;

	newH = MXRPP(H1mxp);
	oldH = MXRPP(Hmxp);
	dim  = MXROWS(H1mxp);

	Hg = MALLOC(dim, double);    /* column vector  H * gamma   */
	gH = MALLOC(dim, double);    /* row vector     gammaT * H  */

	/* Scalar denominators: deltaT * gamma and gammaT * H * gamma. */
	delta_dot_gamma = vectors_inner_prod(dim, deltap, gammap);
	gamma_H_gamma   = matrix_mul_vT_A_u(gammap, Hmxp, gammap);

	/* H * gamma: one inner product per row of H. */
	for ( r = 0; r < dim; r++ ) {
		Hg[r] = vectors_inner_prod(dim, oldH[r], gammap);
	}

	/* gammaT * H: one weighted column sum per column of H. */
	for ( c = 0; c < dim; c++ ) {
		gH[c] = 0.0;
		for ( r = 0; r < dim; r++ ) {
			gH[c] += gammap[r] * oldH[r][c];		
		}
	}

	/* Write the updated matrix into H1, element by element. */
	for ( r = 0; r < dim; r++ ) {
		newrow = newH[r];
		oldrow = oldH[r];
		d_r  = deltap[r];
		Hg_r = Hg[r];
		for ( c = 0; c < dim; c++ ) {
			newrow[c] =  oldrow[c] + d_r * deltap[c]/delta_dot_gamma
			              -  Hg_r * gH[c] / gamma_H_gamma;
		}
	}

	free(Hg);
	free(gH);
}



/*
 *  This routine implements the BFGS algorithm which is used to update
 *  the H1 matrix in quasi-Newton's method.
 *
 *  Output:  updated H1 matrix
 *  Input:   H matrix, the approximation to the inverse of Hessian matrix
 *           delta vector,  delta = xk1 - xk
 *           gamma vector,  gamma = gk1 - gk
 *
 *  The BFGS algorithm
 *  ==================
 *
 *                   gammaT * H * gamma      delta * deltaT
 *  H1 = H + ( 1 + ----------------------)* ---------------
 *                     deltaT * gamma        deltaT * gamma
 *
 *
 *            delta * gammaT * H + H * gamma * deltaT
 *       - (-------------------------------------------)
 *                      deltaT * gamma
 *
 *
 *  NB: Both the column vector and row vector are represented by
 *  the (double *) type.  The interpretation of whether it is
 *  a column vector or a row vector is subject to the context and the 
 *  program implementation.
 *
 *
 */
void nlnopt_bfgs(struct matrix_dsc_s *H1mxp, struct matrix_dsc_s *Hmxp, double *deltap, double *gammap)
{
	/*
	 * Both matrices are processed as dim x dim, where dim is the row
	 * count of H1.  Two scratch vectors of dim entries are allocated
	 * with MALLOC and freed before returning.  The denominator
	 * deltaT * gamma is not checked against zero.
	 */
	unsigned int dim, r, c;
	double **newH, **oldH, *newrow, *oldrow,
	   delta_dot_gamma, gamma_H_gamma,
	   *Hg, *gH, scale, d_r, Hg_r;

	newH = MXRPP(H1mxp);
	oldH = MXRPP(Hmxp);
	dim  = MXROWS(H1mxp);

	Hg = MALLOC(dim, double);    /* column vector  H * gamma   */
	gH = MALLOC(dim, double);    /* row vector     gammaT * H  */

	/* Scalars: deltaT * gamma and gammaT * H * gamma. */
	delta_dot_gamma = vectors_inner_prod(dim, deltap, gammap);
	gamma_H_gamma   = matrix_mul_vT_A_u(gammap, Hmxp, gammap);

	/* H * gamma: one inner product per row of H. */
	for ( r = 0; r < dim; r++ ) {
		Hg[r] = vectors_inner_prod(dim, oldH[r], gammap);
	}

	/* gammaT * H: one weighted column sum per column of H. */
	for ( c = 0; c < dim; c++ ) {
		gH[c] = 0.0;
		for ( r = 0; r < dim; r++ ) {
			gH[c] += gammap[r] * oldH[r][c];		
		}
	}

	/* Coefficient of the delta * deltaT term. */
	scale = ( 1.0 + gamma_H_gamma / delta_dot_gamma) / delta_dot_gamma;

	/* Write the updated matrix into H1, element by element. */
	for ( r = 0; r < dim; r++ ) {
		newrow = newH[r];
		oldrow = oldH[r];
		d_r  = deltap[r];
		Hg_r = Hg[r];
		for ( c = 0; c < dim; c++ ) {
			newrow[c] =  oldrow[c] + scale * d_r * deltap[c] 
			 - (d_r * gH[c] + Hg_r * deltap[c]) / delta_dot_gamma;
		}
	}

	free(Hg);
	free(gH);
}

/*
 *  Auxiliary routine to compute 
 *    
 *     g = alpha * H * x
 *
 *  where g and x are column vectors, H is a matrix and alpha is a scalar.
 *
 */
void nlnopt_mul_H_x(double *gp, double alpha, struct matrix_dsc_s *Hmxp, double *xp)
{
	/*
	 * gp    : output vector, one entry per row of H
	 * alpha : scalar applied to every entry of the product
	 * Hmxp  : the matrix H
	 * xp    : input vector, one entry per column of H
	 *
	 * Each output entry is alpha times the inner product of one row of
	 * H with x, so the inner product length is the column count.  The
	 * original passed the row count, which is only right when H is
	 * square.
	 */
	unsigned int i, m, n;
	double **hrpp;
	
	m = MXROWS(Hmxp);
	n = MXCOLS(Hmxp);
	hrpp = MXRPP(Hmxp);

	for ( i = 0; i < m; i++ ) {
		gp[i] = alpha * vectors_inner_prod(n, hrpp[i], xp);
	}
}

/*
 *  Auxiliary routine to compute 
 *    
 *     w = u + alpha * v
 *
 *  where w, u and v are column vectors and alpha is a scalar.
 *
 */
void nlnopt_vectors_sum(unsigned int n, double *wp, double *up, double *vp, double alpha)
{
	/*
	 * Writes w[i] = u[i] + alpha * v[i] for i = 0 .. n-1, in increasing
	 * index order.  Nothing is written when n is 0.
	 */
	double *dst = wp;
	const double *lhs = up;
	const double *rhs = vp;
	unsigned int remaining = n;

	while ( remaining > 0 ) {
		*dst = *lhs + alpha * *rhs;
		dst++;
		lhs++;
		rhs++;
		remaining--;
	}
}

/*
 *   Multiply two matrixes A and B.  The result is stored in A.
 *   The rank of A will be adjusted when necessary.
 *   A = A * B
 *
 */
void matrix_mul_ab(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp )
{
    /*
     * The product is built in a temporary matrix obtained from
     * matrix_create(), copied into A via matrix_copy_ab() and the
     * temporary is then released with matrix_dsc_fini().  If
     * COL#(A) != ROW#(B) a message is logged and A is left untouched.
     */
    unsigned int a_rows, inner, b_cols, r, c, t;
    struct matrix_dsc_s *prod;
    double *a_row, *p_row, sum;

    a_rows = MXROWS(mxap);
    inner  = MXCOLS(mxap);

    if ( inner != MXROWS(mxbp) ) {
        sprintf(dbg, " A # cols (%d) != B # rows(%d); do nothing",
            inner, MXROWS(mxbp));
        DBG(dbg);
        return;
    }

    b_cols = MXCOLS(mxbp);
    prod = matrix_create("temp matrix ", a_rows, b_cols);

    for ( r = 0; r < a_rows; r++ ) {
        a_row = mxap->rowp[r];          /* current row of A       */
        p_row = prod->rowp[r];          /* matching row of result */
        for ( c = 0; c < b_cols; c++ ) {    /* current column of B */
            sum = 0.0;
            for ( t = 0; t < inner; t++ ) { /* column of A == row of B */
                sum += a_row[t] * mxbp->rowp[t][c];
            }
            p_row[c] = sum;
        }
    }

    /* Unless both operands are square, let A take on the result's dimensions. */
    if ( (a_rows != inner) || (MXROWS(mxbp) != MXCOLS(mxbp)) ) {
        matrix_adj_a2b(mxap, prod);
    }

    matrix_copy_ab(mxap, prod);

    matrix_dsc_fini(prod);
}

/*
 *   Matrix multiplication: A * B 
 *   Return the result in a newly created matrix.
 *
 */
struct matrix_dsc_s *matrix_mul_ab2(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp )
{
	/*
	 * Returns a matrix obtained from matrix_create() holding A * B, or
	 * NULL when MXINDIM_EQ(A, B) fails (a message is logged) or the
	 * result matrix could not be created.
	 *
	 * Fixes over the original:
	 *  - the id buffer is always filled (bounded by snprintf) before it
	 *    is handed to matrix_create(); it used to be passed uninitialised
	 *  - the mismatch message reports B's row count, as its text says,
	 *    rather than B's column count
	 */
	char   idbuf[BUF_LEN];
	struct matrix_dsc_s *mxcp;

	if ( !MXINDIM_EQ(mxap, mxbp) ) {
		sprintf(dbg, " A # cols (%d) != B # rows(%d); do nothing",
			MXCOLS(mxap), MXROWS(mxbp) );
		DBG(dbg);
		return NULL;
	}

	snprintf(idbuf, sizeof idbuf, "product of (%s) x (%s)", 
		MXID(mxap), MXID(mxbp) );

	mxcp = matrix_create(idbuf, MXROWS(mxap), MXCOLS(mxbp));
	if ( mxcp != NULL ) {
		matrix_mul_abc(mxap, mxbp, mxcp);
	}

	return mxcp;
}

/*
 *   Multiply two matrices A and B.  The result is stored in B.
 *   The dimensions of B will be adjusted when necessary.
 *   
 *   B = A * B; B is overwritten.
 *
 *   
 */
void matrix_mul_abb(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp )
{
	/*
	 * The product A * B is computed into a temporary ROW#(A) x COL#(B)
	 * matrix, B is resized if its size differs from the temporary's,
	 * and the temporary is copied into B and then released.
	 */
	unsigned int out_rows, out_cols;
	struct matrix_dsc_s *prod;

	out_rows = MXROWS(mxap);
	out_cols = MXCOLS(mxbp);
	prod = matrix_create("temp", out_rows, out_cols);

	matrix_mul_abc(mxap, mxbp, prod);

	if ( !MXSIZE_EQ(prod, mxbp) ) {
		matrix_resize(mxbp, MXROWS(prod), MXCOLS(prod) );
	}

	matrix_copy_ab(mxbp, prod);
	matrix_dsc_fini(prod);
}

/*
 *   Multiply the transpose of matrix A by matrix B.  The result is
 *   stored in B.  The dimensions of B will be adjusted when necessary.
 *   
 *   B = A.T * B; B is overwritten.
 *   
 */

void matrix_mul_aTbb(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp )
{
	/*
	 * The product A.T * B is computed into a temporary COL#(A) x COL#(B)
	 * matrix, B is resized if its size differs from the temporary's,
	 * and the temporary is copied into B and then released.
	 */
	unsigned int out_rows, out_cols;
	struct matrix_dsc_s *prod;

	out_rows = MXCOLS(mxap);
	out_cols = MXCOLS(mxbp);
	prod = matrix_create("temp", out_rows, out_cols);

	matrix_mul_aTbc(mxap, mxbp, prod);

	if ( !MXSIZE_EQ(prod, mxbp) ) {
		matrix_resize(mxbp, MXROWS(prod), MXCOLS(prod) );
	}

	matrix_copy_ab(mxbp, prod);
	matrix_dsc_fini(prod);
}

/*
 *   Multiply two matrixes A and B.  The result is stored in A.
 *   The rank of A will be adjusted when necessary.
 *   
 *   A = A.T * B , but A will be overwritten.
 *   
 */

void matrix_mul_aTba(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp )
{
	/*
	 * The product A.T * B is computed into a temporary COL#(A) x COL#(B)
	 * matrix, A is resized if its size differs from the temporary's,
	 * and the temporary is copied into A and then released.
	 */
	unsigned int out_rows, out_cols;
	struct matrix_dsc_s *prod;

	out_rows = MXCOLS(mxap);
	out_cols = MXCOLS(mxbp);
	prod = matrix_create("temp", out_rows, out_cols);

	matrix_mul_aTbc(mxap, mxbp, prod);

	if ( !MXSIZE_EQ(prod, mxap) ) {
		matrix_resize(mxap, MXROWS(prod), MXCOLS(prod) );
	}

	matrix_copy_ab(mxap, prod);
	matrix_dsc_fini(prod);
}


/*
 *  Adjust the ranks of matrix A to those of matrix B.  
 */
void matrix_adj_a2b(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp )
{
    unsigned int i, n, m; 

    n = mxbp->num_rows;
    m = mxbp->num_cols;
    
    //sprintf(dbg, " mxap->num_(rows, cols)=(%d, %d)", mxap->num_rows, mxap->num_cols ); DBG(dbg); 
    //sprintf(dbg, " n = %d m = %d", n, m); DBG(dbg); 
    
    if ( mxap->num_rows != n ) {
        if ( mxap->num_rows > n ) { // free up the excessive rowp[...]
            for ( i = n; i < mxap->num_rows;  i++ ) {
                //sprintf(dbg, " freeing rowp[%d]", i); DBG(dbg); 
                free(mxap->rowp[i]);
            }
        }

        mxap->rowp = realloc(mxap->rowp, n * sizeof(double *) );

        if ( mxap->num_rows < n ) {
            // Need to fill the slots [mxap->num_rows ... n-1] for rowps entries of
            // mxap->rowp[...] 
            for ( i = mxap->num_rows; i < n; i++ ) {
                mxap->rowp[i] = calloc( m+1, sizeof(double) );
            }
        }
    }

    if ( mxap->num_cols != m ) {
        for ( i = 0; i < n; i++ ) {
            //sprintf(dbg, " %d djusting column size ", i); DBG(dbg); 
            mxap->rowp[i] = realloc(mxap->rowp[i], m * sizeof(double) );
        }
    }
    
    mxap->num_rows = n;
    mxap->num_cols = m;
}

/*
 *   Copy the data from matrix B to matrix A.  
 */
void matrix_copy_ab(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp )
{
    /*
     * Row-by-row copy of B's elements into A.  Both matrices must have
     * the same row and column counts; otherwise MXSIZE is invoked on
     * both, a message is logged and A is left untouched.  A's id is not
     * changed.
     */
    unsigned int r, nrows, ncols;

    if ( !( MXROWS_EQ(mxap, mxbp) && MXCOLS_EQ(mxap, mxbp) ) ) {
        MXSIZE(mxap);   MXSIZE(mxbp);
        strcpy(dbg, "Matrix params do not match. Do nothing");
        DBG(dbg); 
        return;
    }

    nrows = MXROWS(mxap);
    ncols = MXCOLS(mxap);
    for ( r = 0; r < nrows; r++ ) {
        memcpy(mxap->rowp[r], mxbp->rowp[r], ncols * sizeof(double) );
    }
}

/* Check if matrix A is the identity matrix */
unsigned int matrix_is_id(struct matrix_op_s *mop)
{
    /*
     * Examines the matrix selected by mop->ia.
     * Returns 1 when it is square, every diagonal element equals 1.0
     * and every off-diagonal element equals 0.0; returns 0 otherwise.
     * Comparisons are exact.  (The original compared the off-diagonal
     * elements against 1.0 as well, so a true identity matrix larger
     * than 1 x 1 was rejected.)
     */
    unsigned int i, j, n, m;
    struct matrix_dsc_s *mxp;
    double *rp, expected;

    mxp = mop->mpp[ mop->ia ];
    n = mxp->num_rows;
    m = mxp->num_cols;

    if ( n != m )
        return 0;

    for ( i = 0; i < n; i++ ) {
        rp = mxp->rowp[i];
        for ( j = 0; j < m; j++ ) {
            expected = ( i == j ) ? 1.0 : 0.0;
            if ( rp[j] != expected )
                return 0;
        }
    }

    return 1;
}

/*
 * Log a matrix of strings (mxp->srowppp) through DBG: a title line, a
 * line of 1-based column numbers, a separator line, then one line per
 * row holding the 1-based row number and every cell right-aligned in a
 * 7-character field.  Lines are assembled in the shared `dbg` buffer;
 * each cell is formatted through the shared `msg` buffer.
 */
void matrix_print_str(struct matrix_dsc_s *mxp)
{
    unsigned int r, c, nrows, ncols;
    char **cells, *tail;

    nrows = mxp->num_rows;
    ncols = mxp->num_cols;

    sprintf(dbg, " Matrix %s: %4d X%4d ", mxp->id, nrows, ncols);
    DBG(dbg); 

    /* Column-number header. */
    strcpy(dbg, "      ");
    tail = dbg + strlen(dbg);
    for ( c = 0; c < ncols; c++ ) {
        tail += sprintf(tail, "    %d  ", c+1);
    }
    DBG(dbg); 

    /* Separator line. */
    strcpy(dbg, "    +");
    tail = dbg + strlen(dbg);
    for ( c = 0; c < ncols * 3; c++ ) {
        strcpy(tail, "---");
        tail += 3;
    }
    DBG(dbg); 

    /* One line per matrix row. */
    for ( r = 0; r < nrows; r++ ) {
        cells = mxp->srowppp[r];
        tail = dbg + sprintf(dbg, "%3d |", r+1);

        for ( c = 0; c < ncols; c++ ) {
            sprintf(msg, "%7s", cells[c]); 
            strcpy(tail, msg);
            tail += strlen(msg);
        }
        DBG(dbg); 
    }
}

struct matrix_dsc_s *matrix_init_str(char *datafilep)
{
    unsigned int i, j, n, m, num_rows, num_cols;
    char xs[DS_LEN], **pp, *p;
    struct matrix_dsc_s *mxp;
    FILE *fp;
    double *rp;

    fp = fopen(datafilep, "r");

    if ( fp ) {

        fscanf(fp, "%d %d\n", &num_rows, &num_cols);
        
        mxp = calloc(1, sizeof(struct matrix_dsc_s) );
        mxp->id = strdup(datafilep);
        mxp->num_rows = n = num_rows;
        mxp->num_cols = m = num_cols;

        mxp->srowppp = malloc( n * sizeof(char **) );
        for ( i = 0; i < n; i++ ) {
            mxp->srowppp[i] = malloc( m * sizeof(char *) );
        }

        for ( i = 0; i < n; i++ ) {
            // (m+1) is just for augumenting one more column.
            pp = mxp->srowppp[i];
            for ( j = 0; j < m; j++ ) {
                fscanf(fp, "%s", xs );
                p = strdup(xs);
                pp[j] = p;
            }
        }

        fclose(fp);
    } else {
        sprintf(dbg, "Fatal error, could not open file %s : reason = ", datafilep);
        perror((const char *)strerror(errno));
        DBG(dbg); 
        mxp = NULL;
        exit(1);
    }
    
    return mxp;
}

/*
 * Load a numeric matrix from a text file.
 *
 * The file name is used with all whitespace characters removed.  File
 * layout: two integers (rows, columns) followed by rows*cols
 * whitespace-separated tokens in row-major order.  A token starting with
 * 'X' or 'x' is stored as the value num_rows (marker for a non-existent
 * automata state); every other token is converted with strtod().
 *
 * Each row is allocated with one spare element (cols + 1) so that a column
 * can be augmented later.  Rows are zero-initialised, so cells the file does
 * not provide read as 0.0.
 *
 * Returns the new descriptor.  Failure to open the file, to read the
 * dimensions, or to allocate memory is fatal (logged, then exit(1)).
 */
struct matrix_dsc_s *matrix_init(char *datafilep)
{
    unsigned int i, j, n, m;
    char xs[DS_LEN], fn[DS_LEN], fmt[32];
    struct matrix_dsc_s *mxp;
    FILE *fp;
    double *rp;
    
    /** remove all the possible whitespace chars in the file name. **/
    j = 0;
    for ( i = 0; datafilep[i] != '\0' && j < sizeof(fn) - 1; i++ ) {
        if ( !isspace((unsigned char)datafilep[i]) )
            fn[j++] = datafilep[i];
    }
    fn[j] = '\0';
    
    fp = fopen(fn, "r");
    if ( !fp ) {
        sprintf(dbg, "Fatal error, could not open file \"%s\":reason= %s",
        datafilep, (const char *)strerror(errno) );
        DBG(dbg); 
        exit(1);
    }

    if ( fscanf(fp, "%u %u", &n, &m) != 2 ) {
        sprintf(dbg, "Fatal error, could not read the row/column counts from file \"%s\"", datafilep);
        DBG(dbg); 
        fclose(fp);
        exit(1);
    }

    mxp = calloc(1, sizeof(struct matrix_dsc_s) );
    if ( mxp )
        mxp->rowp = calloc( n ? n : 1, sizeof(double *) );
    if ( !mxp || !mxp->rowp ) {
        sprintf(dbg, "Fatal error, out of memory while loading file \"%s\"", datafilep);
        DBG(dbg); 
        fclose(fp);
        exit(1);
    }
    mxp->id = strdup(datafilep);
    mxp->num_rows = n;
    mxp->num_cols = m;

    // Bounded token format ("%<DS_LEN-1>s") so a long token cannot overrun xs[].
    sprintf(fmt, "%%%us", (unsigned int)(sizeof(xs) - 1));

    for ( i = 0; i < n; i++ ) {
        // (m+1) is just for augmenting one more column.
        mxp->rowp[i] = rp = calloc( m + 1, sizeof(double) );
        if ( !rp ) {
            sprintf(dbg, "Fatal error, out of memory while loading file \"%s\"", datafilep);
            DBG(dbg); 
            fclose(fp);
            exit(1);
        }
        for ( j = 0; j < m; j++ ) {
            if ( fscanf(fp, fmt, xs) != 1 )
                break;                      // short file: the rest of the row stays 0.0
            if ( xs[0] == 'X' || xs[0] == 'x' ) // automata transition tables:
                rp[j] = n;                      // num_rows marks a non-existent state
            else
                rp[j] = strtod(xs, NULL);
        }
    }

    fclose(fp);
    
    return mxp;
}

/** --- Check if the line terminated with a newline '\n' char 
 *  --- contains two integers. ---
 */

int matrix_line_2ints(char *linebufp)
{
	regex_t preg;
	char *p, *regex = "^[ \t]*[1-9][0-9]*[ \t]+[1-9][0-9]*[ \t]*\n";
	int   y = 0, cflags, err, eflags;
	size_t nmatch;
	regmatch_t pmatch[10];

	cflags = REG_EXTENDED|REG_NOSUB | REG_NEWLINE;
	err = regcomp(&preg, regex, cflags);
	p = linebufp;

	if ( !err ) {
		eflags = REG_NOTEOL;
		
		err = regexec(&preg, p, sizeof(pmatch), pmatch, eflags);
		if ( err ==  REG_NOMATCH ) {
			if ( 0 ) {
				sprintf(dbg, "string \"%s\" does not match the pattern.\n", p);
				DBG(dbg);
			}
		} else if ( err ) {
			sprintf(dbg, "regexec() error   err = %d \n",  err);
			DBG(dbg);
		} else { // matched
			y = 1;
		}
		regfree(&preg);

	} else {
		sprintf(dbg, "regcomp() error err = %d \n",  err);
		DBG(dbg);
	}

	return y;
}

/*
 * Count the data items (columns) on one text line.  Items are separated by
 * any run of blanks, tabs, commas, CR or LF.
 *
 * The line is only read, never modified, and nothing is allocated (the
 * previous implementation tokenised a strdup()'ed copy and leaked it).
 *
 * Returns the number of items; as before, the result is never less than 1,
 * even for a line that holds no item at all.
 */
unsigned int matrix_num_cols_per_row(char *linebufp)
{
	static const char s2[] = " \t,\r\n";
	const char *p = linebufp;
	unsigned int num_cols = 0;

	p += strspn(p, s2);             // skip leading separators
	while ( *p != '\0' ) {
		num_cols++;
		p += strcspn(p, s2);        // skip over the item itself ...
		p += strspn(p, s2);         // ... and the separators after it
	}

	return num_cols ? num_cols : 1;
}

/*
 * Parse one text line into a newly allocated row of n doubles.
 *
 *   n:  number of columns expected (size of the returned buffer)
 *   dp: the data line; it is tokenised in place by strtok_r(), so it is
 *       modified.  Items are separated by blanks, tabs, commas, CR or LF.
 *
 * At most n values are stored: surplus items on the line are counted but
 * dropped instead of being written past the end of the buffer, and columns
 * the line does not provide are set to 0.0.  A warning is logged whenever
 * the number of items differs from n.
 *
 * Returns the row buffer (owned by the caller).
 */
double *matrix_one_rp(unsigned int n, char *dp)
{
	unsigned int i, num_cols = 0;
	double *rp;
	char *p, *ctx, *s2 = " \t,\r\n";

	rp = MALLOC(n, double);

	for ( p = strtok_r(dp, s2, &ctx); p; p = strtok_r(NULL, s2, &ctx) ) {
		if ( num_cols < n )
			rp[num_cols] = strtod(p, NULL);
		num_cols++;
	}

	for ( i = num_cols; i < n; i++ )
		rp[i] = 0.0;                // missing columns

	if ( n != num_cols ) {
		sprintf(dbg, "Warning no. of columns mismatched n = %u, counted cols = %u", n, num_cols);
		DBG(dbg); 
	}

	return rp;
}

/**
 * Parse one text line into an existing row buffer.
 *
 *   n:  capacity of rp (number of columns expected)
 *   rp: destination buffer of at least n doubles
 *   dp: the data line; tokenised in place by strtok_r() (it is modified).
 *       Items are separated by blanks, tabs, commas, CR or LF.
 *
 * At most n values are written; surplus items are counted but dropped so
 * rp is never overrun.  Elements for which the line has no item are left
 * untouched.  A warning is logged when the item count differs from n.
 **/
void  matrix_load_one_row(unsigned int n, double *rp, char *dp)
{
	unsigned int num_cols = 0;
	char *p, *ctx, *s2 = " \t,\r\n";

	for ( p = strtok_r(dp, s2, &ctx); p; p = strtok_r(NULL, s2, &ctx) ) {
		if ( num_cols < n )
			rp[num_cols] = strtod(p, NULL);
		num_cols++;
	}

	if ( n != num_cols ) {
		sprintf(dbg, "Warning no. of columns mismatched n = %u, counted cols = %u", n, num_cols);
		DBG(dbg); 
	}

}

/** Given a white-space separated data line, parse it to a matrix row, the 
 *  first element in this row is the number of data items in this row.
 *  
 *  So, if the dp points to a string 
 *
 *    "1 2 3 4 5\n"
 *    
 *  then
 *    rp will be
 *
 *    5  1  2  3  4  5
 * 
 * There are total 6 numerical values and the first items says
 * there are 5 values following it, which come from the original
 * data string.
 *
 */

double *rp_per_row(char *dp)
{
	unsigned int n, num_cols;
    double *rp;
	char *p, *ctx, *ctx2, *s2 = " \t,\r\n";

	n  = 1;
	rp = MALLOC(n+1, double);
	p  = strtok_r(dp, s2, &ctx);  
	rp[n++] = strtod(p, NULL);
	
	while( p = strtok_r(NULL, s2, &ctx) ) {
		rp = realloc(rp, (n+1)* sizeof(double) );
		rp[n++] = strtod(p, NULL);
		//sprintf(dbg, " p = %s", p); DBG(dbg); 
	}

	rp[0] = (double)n-1;

	return rp;
}

// The same as rp_per_row(char *dp), execept the data are in unsigned int.
unsigned  int *ip_per_row(char *dp)
{
	unsigned int n, num_cols, *ip;
	char *p, *ctx, *ctx2, *s2 = " \t,\r\n";

	n  = 1;
	ip = MALLOC(n+1, unsigned int );
	p  = strtok_r(dp, s2, &ctx);  
	ip[n++] = (unsigned int) strtol(p, NULL, 10);
	
	while( p = strtok_r(NULL, s2, &ctx) ) {
		ip = realloc(ip, (n+1)* sizeof(unsigned int) );
		ip[n++] = (unsigned int) strtol(p, NULL, 10);
		//sprintf(dbg, " p = %s", p); DBG(dbg); 
	}

	ip[0] = (double)n-1;

	return ip;
}


/** --------------------- Matrix reshape -------------------------
 *
 * Create a new a x b matrix whose id records the shape of the input
 * matrix ("reshape of <id> (<rows>x<cols>)").
 *
 * NOTE(review): only the new matrix is created here; no element of inmxp
 * is copied into it.  Confirm whether callers fill it themselves.
 **/
struct matrix_dsc_s *matrix_reshape(struct matrix_dsc_s *inmxp, unsigned int a, unsigned int b)
{
	unsigned int inrows, incols;
	char id[DS_LEN];

	inrows = MXROWS(inmxp);
	incols = MXCOLS(inmxp);     // was MXROWS(): the id reported rows x rows

	sprintf(id, "reshape of %s (%dx%d)", inmxp->id, inrows, incols);

	return matrix_create(id, a, b);
}


/*
 * Set the no_rp flag of a matrix descriptor.  matrix_dsc_fini() skips
 * freeing the individual rows of a matrix whose flag is set, so this marks
 * a matrix whose row storage is not its own.
 * (An inline variant of this function was tried and disabled.)
 */
void matrix_set_no_rp(struct matrix_dsc_s *mxp)
{
	mxp->no_rp = 1;
}

/*
 *  Create a virtual (shadow) matrix which points to a slice of row of 
 *  bigger matrix (in terms of number of rows).  Actual data
 *  storage will not be created.
 */


struct matrix_dsc_s *matrix_create_vs(struct matrix_dsc_s *inmxp,
		unsigned int row_lo, unsigned int row_hi)
{
	unsigned int i, j, m, n, num_rows, num_cols;
	struct matrix_dsc_s *mxp;
	char id[DS_LEN];
	
	n = row_hi - row_lo + 1;
	num_rows = MXROWS(inmxp);
	num_cols = MXCOLS(inmxp);
	if ( num_rows >= n ) {
		sprintf(id, "virtual slice of %s", MXID(inmxp)); DBG(dbg); 
	
		mxp = matrix_create_rowp_only(id, n, num_cols);

		for ( i = 0; i < n; i++ ) {
			mxp->rowp[i] = inmxp->rowp[i + row_lo];
		}
	} else {
		sprintf(dbg, "Parameters error: inmxp rows = %d, the requested row range is larger [%d:%d) (%d rows)",
		num_rows, row_lo, row_hi, n); DBG(dbg); 	
		mxp = NULL;
	}

	return mxp;
}

void matrix_fill_vs(struct matrix_dsc_s *inmxp, struct matrix_dsc_s *vsmxp,
                    unsigned int row_lo, unsigned int row_hi)
{
	unsigned int i, j, m, n, num_rows, num_cols;
	struct matrix_dsc_s *mxp = vsmxp;
	char id[DS_LEN];
	
	n = row_hi - row_lo + 1;
	num_rows = MXROWS(inmxp);
	num_cols = MXCOLS(inmxp);
	if ( num_rows >= n ) {
		mxp->num_rows = n;
		mxp->num_cols = num_cols;
		mxp->rowp = realloc(mxp->rowp, n * sizeof(double *));
		for ( i = 0; i < n; i++ ) {
			mxp->rowp[i] = inmxp->rowp[i + row_lo];
		}
		matrix_set_no_rp(mxp);
	} else {
		sprintf(dbg, "Parameters error: inmxp rows = %d, the requested row range is larger [%d:%d) (%d rows)",
		num_rows, row_lo, row_hi, n); DBG(dbg); 	
	}
}

/** Create a matrix descriptor WITHOUT data storage: only the id and the
 *  row-pointer table (num_rows entries, not initialised here) are allocated.
 *  The caller is expected to point each rowp[i] at existing data.  The
 *  descriptor is flagged with no_rp, so matrix_dsc_fini() will not free
 *  the rows.
 *
 *  id may be NULL, in which case a placeholder id is used.
 *  Returns the descriptor, or NULL if the descriptor cannot be allocated.
 **/
struct matrix_dsc_s *matrix_create_rowp_only(char *id, unsigned int num_rows, unsigned int num_cols)
{
	struct matrix_dsc_s *mxp;

	mxp = calloc(1, sizeof(struct matrix_dsc_s) );
	if ( !mxp )
		return NULL;

	mxp->id = strdup( id ? id : "(unknown id)" );
	mxp->num_rows = num_rows;
	mxp->num_cols = num_cols;
	mxp->rowp = MALLOC(num_rows, double *);

	matrix_set_no_rp(mxp);  // set the no storage flag

	return mxp;
}

/**----- Switch rows of a matrix by a row index list -----
 *
 * What the code does today: it reverses the order of the first num_ris
 * row pointers of mxp (row 0 <-> row num_ris-1, row 1 <-> row num_ris-2, ...).
 * Only pointers are swapped; no element data is copied.
 *
 * NOTE(review): the index list rip is never read, so the rows are not
 * selected "by a row index list" as the title says -- confirm the intent.
 *
 * If num_ris exceeds the number of rows, an error is logged and nothing
 * is changed.
 **/
void matrix_sw_rows(struct matrix_dsc_s *mxp, unsigned int *rip, unsigned int num_ris)
{
	unsigned int i, j, m, n, num_rows, num_cols;
	double *p;
	
	n = num_ris;
	num_rows = MXROWS(mxp);
	num_cols = MXCOLS(mxp);
	if ( num_rows >= n ) {
		m = n/2;    // number of pairs to exchange
		for ( i = 0; i < m; i++ ) {
			p = mxp->rowp[i];
			mxp->rowp[i] = mxp->rowp[n - i-1];
			mxp->rowp[n - i-1] = p;
		}
	} else {
		sprintf(dbg, "Parameters error: mxp rows = %d, requested %d rows to be swapped.",
		num_rows, n); DBG(dbg); 	
	}
}



/*
 *  Create a series of special matrixes.  They are special in that
 *  the rows of these matrixes are in continuous memory addresses
 *  (inside the caller's buffer dp) and they can still be operated on as
 *  normal matrixes.
 *
 *  For each k in [0, num_mxps) two row-pointer-only matrixes are laid
 *  over dp, one after the other:
 *     - a weight matrix of a[2*k] rows by a[2*k+1] columns, then
 *     - a one-row bias matrix of a[2*k+1] columns.
 *  dp must therefore hold the sum over k of (a[2*k] + 1) * a[2*k+1] doubles.
 *
 *  Note: compare to other matrix_arrays(), this adds a matrix for
 *   bias components in each layer.
 *
 *  g_flag specifies if the created matrixes are gradient descent.
 *     0:  means normal weight and bias matrixes
 *     1:  means gradient matrixes for weights and biases 
 *  (it only changes the ids given to the matrixes).
 *
 *  When mop is non-NULL the matrixes are added to it and the index of the
 *  first one added is returned.  When mop is NULL the descriptors are
 *  released again and 0 is returned.
 */
unsigned int matrix_arrays(struct matrix_op_s *mop, double *dp, unsigned int a[], unsigned int num_mxps, unsigned int g_flag)
{
	unsigned int i, k, rn, cn, mx_start_index;
	struct matrix_dsc_s *mxp, *bias_mxp;
	double **rpp, **bias_rpp, *p = dp;
	char wt_id[BUF_LEN], bias_id[BUF_LEN];

	// Do not query a NULL mop for its matrix count.
	mx_start_index = mop ? get_num_matrixes(mop) : 0;

	for ( k = 0; k < num_mxps; k++ ) {
		rn = a[2*k];
		cn = a[2*k+1];
		sprintf(wt_id, "weights %s connecting layers %d -- %d",
			g_flag ? "grad" : "", k, k+1); 
		sprintf(bias_id, "bias %s for wts cxning layers %d -- %d", 
			g_flag ? "grad" : "", k, k+1); 

		// Weight matrix: rn consecutive rows of cn doubles each.
		mxp = matrix_create_rowp_only(wt_id, rn, cn);
		rpp = MXRPP(mxp);
		for ( i = 0; i < rn; i++ ) {
			rpp[i] = p;
			p += cn;
		}
		
		// Bias matrix is a one-row matrix placed right after the weights.
		bias_mxp = matrix_create_rowp_only(bias_id, 1, cn);
		bias_rpp = MXRPP(bias_mxp);
		bias_rpp[0] = p;
		p += cn;

		if ( mop ) {
			matrix_add_mxp(mop, mxp);
			matrix_add_mxp(mop, bias_mxp);
		} else {
			// matrix_dsc_fini() releases what the descriptor owns but not
			// the descriptor itself, so free that too.
			matrix_dsc_fini(mxp);
			free(mxp);
			matrix_dsc_fini(bias_mxp);
			free(bias_mxp);
		}
	}
	
	return mx_start_index;
}


/*
 *   Create a matrix descriptor only; the element storage is the caller's
 *   buffer p.  Row r of the new matrix points at p + r * num_cols.
 *
 *   mop:  when non-NULL, the new matrix is also added to it.
 *   newp: when non-NULL, receives the address just past the storage
 *         consumed by this matrix, i.e. the next available storage.
 *
 *   Returns the new descriptor.
 */
struct matrix_dsc_s *matrix_create_rowp(struct matrix_op_s *mop, char *id, double *p, double **newp, unsigned int num_rows, unsigned int num_cols)
{
	struct matrix_dsc_s *view = matrix_create_rowp_only(id, num_rows, num_cols);
	double *cursor = p;
	unsigned int r;

	// Hand out one row's worth of storage at a time.
	for ( r = 0; r < num_rows; r++, cursor += num_cols )
		view->rowp[r] = cursor;

	if ( mop )
		matrix_add_mxp(mop, view);

	if ( newp )
		*newp = cursor;

	return view;
}


struct matrix_dsc_s *matrixes_row_mxp(struct matrix_op_s *mop, unsigned int idx, unsigned int n)
{
	unsigned int k, i, j, m, sz, rn, cn;
	struct matrix_dsc_s *mxp, *rmxp;
	double *rp, **rpp, *p;

	sz = 0;
	for ( i = 0; i < n; i++ ) {
		mxp = MOP2MXP(mop, idx+i);
		sz += mxsize(mxp); 
	}
	//sprintf(dbg, "%d matrixes, sz = %d", n, sz); DBG(dbg); 
	
	rmxp = matrix_create2(mop, "row vector", 1, sz);
	rpp = MXRPP(rmxp);
	p = rpp[0];

	for ( k = 0; k < n; k++ ) {
		mxp = MOP2MXP(mop, idx+k);
		rpp = MXRPP(mxp);
		rn = MXROWS(mxp);
		cn = MXCOLS(mxp);
		for ( i = 0; i < rn; i++ ) {
			rp = rpp[i];
			for ( j = 0; j < cn; j++ ) {
				*p++ = rp[j];	
			}
		}
	}

	return rmxp;
}

/*
 * The same as matrix_init(), except this version allows comment lines in the
 * input data file.  A line is skipped when its first character is a
 * newline, '#', '%' or '!'.
 *
 * In order to be compatible with the old data file for specifying
 * the matrices, it still accepts that the first data line can
 * specify the numbers of rows and columns.
 *
 *                    But this can be ambiguous
 *    11  9             1 2
 *    ...               2 2
 *    ...
 *    ...
 *
 * How the first data line is interpreted when it holds exactly two positive
 * integers "m n":
 *   - n == 2: it could be a "rows cols" header or a 2-column data row.  It
 *     is kept as a data row and dropped at the end if the file turns out to
 *     hold exactly m further rows.
 *   - n != 2: the next data line decides.  If that line has n columns the
 *     first line was a header; otherwise the first line is kept as data.
 *     If there is no further data line, the first line is the only row.
 * Any other first data line simply fixes the column count.
 *
 * Returns a newly allocated descriptor (an empty 0 x 0 matrix when the file
 * holds no data line).  Failure to open the file is fatal (logged, exit(1)).
 */
struct matrix_dsc_s *matrix_init2(char *datafilep)
{
    unsigned int i, j, n, m, num_rows, num_cols, y_2ints, dl, fl_amb;
    unsigned int got_first, got_second;
    char c, fn[DS_LEN], buf[DS_LEN], *p, *ctx, *s2 = " \t,\r\n"; 
    FILE *fp;
    double *rp, *rp2, **grown;
    struct matrix_dsc_s *mxp;
    
    dl = 0, fl_amb = 0, got_first = 0, got_second = 0;
    num_rows = num_cols = 0;
    n = 0;
    rp = rp2 = NULL;
    
    /** remove all the whitespace chars in the file name (bounded by fn[]). **/
    j = 0;
    for ( i = 0; datafilep[i] != '\0' && j < sizeof(fn) - 1; i++ ) {
        if ( !isspace((unsigned char)datafilep[i]) )
            fn[j++] = datafilep[i];
    }
    fn[j] = '\0';
    
    fp = fopen(fn, "r");
    if ( !fp ) {
        sprintf(dbg, "Fatal error, could not open file \"%s\" : reason = %s",
        datafilep, (const char *)strerror(errno) );
        DBG(dbg); 
        exit(1);
    }
    
    // ---  Read the first data line to identify how many columns there are in a line. ---
    while ( fgets(buf, sizeof(buf), fp ) ) {
        c = buf[0];
        if ( c == '\n' || c == '#' || c == '%' || c == '!' )
            continue;

        got_first = 1;
        y_2ints = matrix_line_2ints(buf);

        if ( y_2ints ) {
            // Inherently ambiguous: header ("rows cols") or a 2-column data
            // row.  Keep the two numbers as a candidate row for now.
            p = strtok_r(buf,  s2, &ctx);   m = atoi(p);
            p = strtok_r(NULL, s2, &ctx);   n = atoi(p);
            rp = MALLOC(2, double);
            rp[0] = m;    num_rows = m;  
            rp[1] = n;    num_cols = n;
            if ( n == 2 ) {
                // Can only be resolved after reading the whole file:
                // treat it as a data row right now.
                fl_amb = 1;   dl = 1;
            }
            // else: resolved below by looking at the next data line.
        } else {
            // A plain data line: just count its columns.
            dl = 1;
            num_cols = matrix_num_cols_per_row(buf);
        }
        break;  // we got the first data line
    }

    mxp = CALLOC(1, struct matrix_dsc_s );
    mxp->id = strdup(datafilep);

    if ( !got_first ) {
        // Nothing but comments/blank lines: there is no line to parse.
        fclose(fp);
        sprintf(dbg, "Warning: no data line found in file \"%s\"", datafilep);
        DBG(dbg); 
        mxp->rowp = NULL;
        mxp->num_rows = 0;
        mxp->num_cols = 0;
        return mxp;
    }

    //-------------------------------------------------------
    // "m n" with n != 2: read one more data line to tell header from data.
    if ( !dl ) {
        while ( fgets(buf, sizeof(buf), fp ) ) {
            c = buf[0];
            if ( c == '\n' || c == '#' || c == '%' || c == '!' )
                continue;

            got_second = 1;
            num_cols = matrix_num_cols_per_row(buf);
            if ( num_cols != n ) {
                rp2 = rp;       // the first line was data after all
            }
            break;
        }
    }

    if ( fl_amb ) {
        // rp already holds the first line as a data row.
    } else if ( !dl && !got_second ) {
        // The "m n" line is the only data line in the file: it is the row.
        num_cols = 2;
    } else {
        if ( rp != rp2 )
            free(rp);           // the first line was a header; drop it
        rp = matrix_one_rp(num_cols, buf);
    }

    // Up to two rows are stored before the main loop, so start with room
    // for two pointers.
    m = 0;
    mxp->rowp = MALLOC(2, double *);
    if ( rp2 ) {
        mxp->rowp[m++] = rp2;
    }
    mxp->rowp[m++] = rp;

    while ( fgets(buf, sizeof(buf), fp ) ) {
        c = buf[0];
        if ( c == '\n' || c == '#' || c == '%' || c == '!' )
            continue;

        grown = realloc(mxp->rowp, (m+1) * sizeof(double *) );
        if ( !grown ) {
            sprintf(dbg, "Warning: out of memory, matrix truncated to %u rows", m);
            DBG(dbg); 
            break;
        }
        mxp->rowp = grown;
        mxp->rowp[m++] = matrix_one_rp(num_cols, buf);
    }

    fclose(fp);
    
    /**----- Check if the first non-commentary line specified no. of rows or simply a
     * data line -----**/
    if ( fl_amb ) {
        if ( (num_rows + 1) == m ) { // The first datum on the first
            // non-commentary line is one less than the no. of rows read:
            // this is the old style of specifying the no. of rows and columns.
            // Remove this row.
            free(mxp->rowp[0]);
            for ( i = 1; i < m; i++ ) {
                mxp->rowp[i-1] = mxp->rowp[i];
            }
            m--;
            grown = realloc(mxp->rowp, m * sizeof(double*));
            if ( grown )
                mxp->rowp = grown;
        }
    }

    mxp->num_rows = m;
    mxp->num_cols = num_cols;
    
    return mxp;
}


/*
 * Release everything a matrix descriptor owns; the descriptor itself is
 * NOT freed (the caller does that, see matrix_op_fini()).
 *
 *  - string matrix (srowppp set): every cell string, every row table and
 *    the table of rows are freed;
 *  - numeric matrix: the rows are freed unless the no_rp flag says they
 *    are not owned, then the row-pointer table is freed;
 *  - the id is freed in both cases (it used to leak for string matrixes);
 *  - mp / np are freed when their counts m / n are non-zero.
 *
 * Freed pointers are reset to NULL.  A NULL mxp is ignored.
 */
void matrix_dsc_fini(struct matrix_dsc_s *mxp)
{
    unsigned int i, j, n, m;
    char ***ppp, **pp;

    if ( !mxp )
        return;

    n = mxp->num_rows;
    m = mxp->num_cols;
    ppp = mxp->srowppp;

    if ( ppp ) {
        for ( i = 0; i < n; i++ ) {
            pp = ppp[i];
            if ( !pp )
                continue;
            for ( j = 0; j < m; j++ ) {
                free(pp[j]);
            }
            free(pp);
        }
        free(ppp);
        mxp->srowppp = NULL;
    
    } else {
		
        if ( !mxp->no_rp && mxp->rowp ) {
            for ( i = 0; i < n; i++ ) {
                free(mxp->rowp[i]);
            }
        }
        free(mxp->rowp);
        mxp->rowp = NULL;
    }

    free(mxp->id);
    mxp->id = NULL;

    if ( mxp->m ) { free(mxp->mp);  mxp->mp = NULL; }
    if ( mxp->n ) { free(mxp->np);  mxp->np = NULL; }
}

/*
 * Tear down a matrix_op_s: finalise and free every non-NULL matrix it
 * holds, then free the matrix pointer table and the mop itself.
 */
void matrix_op_fini(struct matrix_op_s *mop)
{
    unsigned int idx;
    unsigned int count = mop->num_matrixes;
    struct matrix_dsc_s *cur;

    for ( idx = 0; idx < count; idx++ ) {
        cur = mop->mpp[idx];
        if ( !cur )
            continue;               // empty slot, nothing to release
        matrix_dsc_fini(cur);
        free(cur);
    }

    free(mop->mpp);    
    free(mop);
}



/*
 * Print every matrix held by mop, using the string printer when mop->str
 * is set and the numeric printer otherwise.
 */
void matrix_op_print(struct matrix_op_s *mop)
{
    unsigned int idx = 0;
    unsigned int count = mop->num_matrixes;
    struct matrix_dsc_s *cur;

    while ( idx < count ) {
        cur = mop->mpp[idx++];
        if ( !mop->str ) {
            matrix_print(cur);
            continue;
        }
        matrix_print_str(cur);
    }
}

/*
 * Walk over all the matrixes held by mop and apply MXSIZE2(index, matrix)
 * to each of them, in index order.
 */
void matrix_op_list(struct matrix_op_s *mop)
{
    unsigned int i = 0;
    struct matrix_dsc_s *mxp;

    while ( i < mop->num_matrixes ) {
        mxp = mop->mpp[i];
        MXSIZE2(i, mxp);
        i++;
    }
}

void matrix_write_file(char *fname, struct matrix_dsc_s *mxp)
{
    unsigned int i, j, n, m, cols, aug;
    double **rpp, *rp;
    FILE *fp;

	m = MXROWS(mxp), n = MXCOLS(mxp), rpp = MXRPP(mxp);
    fp = fopen(fname, "w");

    for ( i = 0; i < m; i++ ) {
        rp = rpp[i];
        for ( j = 0; j < n; j++ ) {
            fprintf(fp, "%8.7g ", rp[j]);
        }
        fprintf(fp, "\n");
    }
    
    fclose(fp);
}

/*
 * Roughly the same as matrix_init2() except that data are in binary.
 * The format is
 *    the first two binary numbers are two unsigned integers telling
 *    the row number and column number with the rest being the data for
 *    the matrix in double type, row by row.
 *
 * When mop is non-NULL the matrix is created inside it (matrix_create2),
 * otherwise a stand-alone matrix is created (matrix_create).
 *
 * Returns the matrix, or NULL when the file cannot be opened, the header
 * cannot be read, or the matrix cannot be created (previously an
 * uninitialised pointer was returned on open failure).  A file that ends
 * before all rows are read is reported; the rows read so far are kept.
 */
struct matrix_dsc_s *matrix_init_bin(struct matrix_op_s *mop, char *datafilep)
{
	unsigned int  i, m, n;
	double **rpp;
	struct matrix_dsc_s *mxp = NULL;
	FILE *fp;

	fp = fopen(datafilep, "r");
	if ( !fp ) {
        sprintf(dbg, "Fatal error, could not open file \"%s\": reason = %s",
        datafilep, (const char *)strerror(errno) );
        DBG(dbg); 
		return NULL;
	}

	// -- Read  the # of rows and columns -- 
	if ( fread(&m, sizeof(unsigned int), 1, fp) != 1 ||
	     fread(&n, sizeof(unsigned int), 1, fp) != 1 ) {
		sprintf(dbg, "Error: file \"%s\" is too short to hold the row/column header", datafilep);
		DBG(dbg); 
		fclose(fp);
		return NULL;
	}

	if ( mop ) {
		mxp = matrix_create2(mop, datafilep, m, n);
	} else {
		mxp = matrix_create(datafilep, m, n);
	}

	if ( mxp ) {
		rpp = MXRPP(mxp);
		for ( i = 0; i < m; i++ ) {
			if ( fread(rpp[i], sizeof(double), n, fp) != n ) {
				sprintf(dbg, "Error: file \"%s\" ended while reading row %u of %u", datafilep, i, m);
				DBG(dbg); 
				break;
			}
		}
	}

	fclose(fp);

	return mxp;
}

/** Save the data of a matrix_dsc_s {} into the specified binary datafile.
 *
 *  Layout written: the number of rows and the number of columns as two
 *  unsigned ints, followed by every row as num_cols doubles.
 *  If the file cannot be opened, the reason is logged and nothing is written.
 **/
void matrix_save_binfile(char *datafilep, struct matrix_dsc_s *mxp)
{
	unsigned int rows = MXROWS(mxp);
	unsigned int cols = MXCOLS(mxp);
	unsigned int r;
	double **rowtab = MXRPP(mxp);
	FILE *out = fopen(datafilep, "w");

	if ( !out ) {
        sprintf(dbg, "Fatal error, could not open file \"%s\": reason = %s",
        datafilep, (const char *)strerror(errno) );
        DBG(dbg); 
		return;
	}

	// -- Header: the # of rows, then the # of columns --
	fwrite(&rows, sizeof(unsigned int), 1, out);
	fwrite(&cols, sizeof(unsigned int), 1, out);

	// -- Body: the data, one row per fwrite() --
	for ( r = 0; r < rows; r++ )
		fwrite(rowtab[r], sizeof(double), cols, out);

	fclose(out);
}

/** Load a matrix_dsc_s {} from the specified datafile. **/
void matrix_load_binfile(char *datafilep, struct matrix_dsc_s *mxp)
{
	unsigned int  i, j, m, n;
	double d, t, **rpp, *rp;
	FILE *fp;

	m = MXROWS(mxp),  n = MXCOLS(mxp),  rpp = MXRPP(mxp);
	fp = fopen(datafilep, "r");
	
	if ( fp ) {
		// -- Read  the # of rows and columns -- 
		fread(&m, sizeof(unsigned int), 1, fp);
		fread(&n, sizeof(unsigned int), 1, fp);
		
		if ( (m == MXROWS(mxp)) && (n == MXCOLS(mxp))) {
			for ( i = 0; i < m; i++ ) {
				rp = rpp[i];
				// -- Read the data row by row --
				fread(rp, sizeof(double), n, fp);
			}
		} else {
			sprintf(dbg, "dims not match: mxp (%d, %d) while datafile says(%d, %d)", 
				MXROWS(mxp), MXCOLS(mxp), m, n); DBG(dbg);			
		}
		
		fclose(fp);
	} else {
	
        sprintf(dbg, "Fatal error, could not open file \"%s\": reason = %s",
        datafilep, (const char *)strerror(errno) );
        DBG(dbg); 
	}
}


/*
 * Print a matrix: as a character diagram when its chr flag is set,
 * otherwise all of its rows through matrix_printm().
 */
void matrix_print(struct matrix_dsc_s *mxp)
{
	if ( !mxp->chr ) {
		matrix_printm(mxp, 0, MXROWS(mxp));
		return;
	}

	matrix_diagram_print(mxp);
}


/** Print the [ri_lo: ri_hi-1] rows of matrix mxp
 *
 *  Output goes through DBG(): a title line, a header of centred column
 *  indexes, a rule, then one line per requested row.  When mxp->srowppp is
 *  set the cells are printed as strings, otherwise as doubles.  When
 *  mxp->aug is non-zero one extra column (index num_cols) is printed.
 *
 *  ri_lo / ri_hi are not checked against the number of rows here.
 *  Text is assembled in dbg/msg, which are not declared in this function
 *  (presumably file-level scratch buffers -- TODO confirm their sizes).
 **/
void matrix_printm(struct matrix_dsc_s *mxp, unsigned int ri_lo, unsigned int ri_hi)
{
    unsigned int i, j, n, m, cols, aug, flag_str, width;
	char buf[BUF_LEN], print_dsc[BUF_LEN];
	char ***strppp, **strpp, *strp;
    double **rpp, *rp;

	width = 16;     // characters per printed column
    n = MXROWS(mxp); 
    m = MXCOLS(mxp); 
    //aug = 1;
    aug = mxp->aug;
    cols = aug ? m+1 : m;

    sprintf(dbg, "Matrix %s: %4d x %-4d", mxp->id, n, m); DBG(dbg); 
    
    /***-- Just print the boundary marking --***/
    
    // Header: every column index centred in a field of "width" characters.
    strcpy(dbg,"      |");
    for ( i = 0; i < cols; i++ ) {
        //sprintf(msg, "%*d%*s", width/2, i, width/2+1," "); 
        pr_center(msg, i, width);
		strcat(dbg, msg);
    }
    strcat(dbg, "  |");
    DBG(dbg); 

    // Rule under the header; its length uses m, not cols, so it does not
    // cover the augmented column.
    strcpy(dbg,"  ----+");
    for ( i = 0; i < width*m; i++ ) {
        strcat(dbg, "-");
    }
    strcat(dbg, "--+");
    DBG(dbg); 
    /*** End of printing the boundary marking ***/
	
	
	// Pick the cell storage: strings when srowppp is set, doubles otherwise.
	flag_str = mxp->srowppp ? 1 : 0;
	if ( flag_str ) {
		strppp = MXRPPSTR(mxp);
	} else {
		rpp = MXRPP(mxp);
	}
    
	if ( flag_str ) { // string version
	
		for ( i = ri_lo; i < ri_hi; i++ ) {
        	strpp = strppp[i];
        
			if ( 0 )  // Don't print the row marking
        	    dbg[0] = '\0';
        	else
            	sprintf(dbg, "%5d |", i);
        
       		for ( j = 0; j < cols; j++ ) {
            	sprintf(msg, " %12s ", strpp[j]); 
            	strcat(dbg, msg);
       		}
       		DBG(dbg); 
    	}

	} else {  // numerical version
		
		for ( i = ri_lo; i < ri_hi; i++ ) {
        	rp = rpp[i];
        
			if ( 0 )  // Don't print the row marking
        	    dbg[0] = '\0';
        	else
            	sprintf(dbg, "%5d |", i);
        
       		for ( j = 0; j < cols; j++ ) {
            	sprintf(msg, "%' '-*.10g", width, rp[j]); 
            	strcat(dbg, msg);
       		}
			strcat(dbg, "  |");
       		DBG(dbg); 
    	}
    	
		// Closing rule (numeric version only).
		strcpy(dbg,"  ----+");
    	for ( i = 0; i < width*m; i++ ) {
        	strcat(dbg, "-");
    	}
    	strcat(dbg, "--+");
    	DBG(dbg); 
	}
}


/** Build in "id" a string holding the integer k centred in a field of
 *  "width" characters.  When the padding cannot be split evenly the extra
 *  blank goes to the left.  Each side is produced with "%*s" applied to a
 *  single blank, so a side is never shorter than one character.
 **/
void pr_center(char *id, int k, int width)
{
	char digits[1024];  // scratch, only used to measure the printed k
	int pad, left, right;

	sprintf(digits, "%d", k);
	pad = width - (int)strlen(digits);

	right = pad / 2;
	left = right;
	if ( pad % 2 )
		left += 1;      // odd padding: one more blank on the left

	sprintf(id, "%*s%d%*s", left, " ", k, right, " ");
}

/** Build in "id" a string holding the character k centred in a field of
 *  "width" characters.  When the padding cannot be split evenly the extra
 *  blank goes to the left.  Each side is produced with "%*s" applied to a
 *  single blank, so a side is never shorter than one character.
 **/
void pr_char_center(char *id, int k, int width)
{
	char glyph[1024];   // scratch, only used to measure the printed k
	int pad, left, right;

	sprintf(glyph, "%c", k);
	pad = width - (int)strlen(glyph);

	right = pad / 2;
	left = right;
	if ( pad % 2 )
		left += 1;      // odd padding: one more blank on the left

	sprintf(id, "%*s%c%*s", left, " ", k, right, " ");
}

/** Print the [ci_lo: ci_hi-1] cols of matrix mxp
 *
 *  Output goes through DBG(): a title line, a header with the selected
 *  column indexes, a dashed rule, then every row of the matrix restricted
 *  to the selected columns (each value printed with " %8g ").
 *
 *  ci_lo / ci_hi are not checked against the number of columns here.
 *  Text is assembled in dbg/msg, which are not declared in this function
 *  (presumably file-level scratch buffers -- TODO confirm their sizes).
 **/
void matrix_printn(struct matrix_dsc_s *mxp, unsigned int ci_lo, unsigned int ci_hi)
{
    unsigned int i, j, n, m, cols, aug;
    double *rp;

    n = mxp->num_rows;
    m = mxp->num_cols;
    
    //aug = 1;
    aug = mxp->aug;
    cols = aug ? m+1 : m;   // computed but not used below

    sprintf(dbg, " Matrix %s: %4d X%4d ", mxp->id, n, m); DBG(dbg); 
    
    /***-- Just print the boundary marking --***/
    
    strcpy(dbg,"   ");
    for ( i = ci_lo; i < ci_hi; i++ ) {
        sprintf(msg, "%13d ", i); 
        strcat(dbg, msg);
    }
    DBG(dbg); 

    // Rule: one "---" for every i in [ci_lo, ci_hi*3).
    strcpy(dbg,"    +");
    for ( i = ci_lo; i < ci_hi * 3; i++ ) {
        strcat(dbg, "---");
    }
    DBG(dbg); 
    
    /*** End of printing the boundary marking ***/

    for ( i = 0; i < n; i++ ) {
        rp = mxp->rowp[i];

        if ( 0 )  // Don't print the row marking
            dbg[0] = '\0';
        else
            sprintf(dbg, "%3d |", i);
        
        for ( j = ci_lo; j < ci_hi; j++ ) {
            //sprintf(msg, "%13.11g ", rp[j]); 
            sprintf(msg, " %8g ", rp[j]); 
            strcat(dbg, msg);
        }
        DBG(dbg); 
    }

}

/*
 * Create an m x n matrix through matrix_create() and set its chr flag, so
 * that matrix_print() renders it as a character diagram.
 */
struct matrix_dsc_s *matrix_diagram_create(char *id, unsigned int m, unsigned int n)
{
	struct matrix_dsc_s *diagram;

	diagram = matrix_create(id, m, n);
	diagram->chr = 1;       // print as a diagram, not as numbers

	return diagram;
}

/*
 * Print a matrix as a character diagram through DBG(): every element
 * whose value truncates to a non-zero unsigned integer is drawn as 'X',
 * every other element as a blank.  Each cell is "width" (5) characters
 * wide.
 *
 * The column header and the two horizontal rules are sized by the number
 * of columns (they used to be sized by the number of rows, which only
 * lined up for square matrixes).
 */
void  matrix_diagram_print(struct matrix_dsc_s *mxp)
{
	unsigned int i, j, m, n, t, width;
	double **rpp, *rp;

	m = MXROWS(mxp);  width = 5;
	n = MXCOLS(mxp);  rpp = MXRPP(mxp);
	
    sprintf(dbg, "Char diagram %s: %4d x %-4d", mxp->id, m, n); DBG(dbg); 
    /***-- Just print the boundary marking --***/
    
    // Header: one centred index per COLUMN.
    strcpy(dbg,"      |");
    for ( j = 0; j < n; j++ ) {
        pr_center(msg, j, width);
		strcat(dbg, msg);
    }
    strcat(dbg, "  |");
    DBG(dbg); 

    strcpy(dbg,"  ----+");
    for ( i = 0; i < width*n; i++ ) {
        strcat(dbg, "-");
    }
    strcat(dbg, "--+");
    DBG(dbg); 
    /*** End of printing the boundary marking ***/
	
	for ( i = 0; i < m; i++ ) {
		rp = rpp[i];
		sprintf(dbg, "%5d |", i);
		for ( j = 0; j < n; j++ ) {
			t = (unsigned int)(rp[j]);
			pr_char_center(msg, t ? 'X' : ' ', width);
			strcat(dbg, msg);
		}
		strcat(dbg, "  |");
		DBG(dbg);
	}
	
	strcpy(dbg,"  ----+");
   	for ( i = 0; i < width*n; i++ ) {
       	strcat(dbg, "-");
   	}
   	strcat(dbg, "--+");
   	DBG(dbg); 
}

/** matrix op: multiply the matrixes selected by mop->ia and mop->ib
 *  by handing them to matrix_mul_ab(); the result is stored in the
 *  first one (matrix A).
 **/
void matrix_op_mul(struct matrix_op_s *mop)
{
    struct matrix_dsc_s *lhs = mop->mpp[ mop->ia ];
    struct matrix_dsc_s *rhs = mop->mpp[ mop->ib ];

    matrix_mul_ab(lhs, rhs);
}

void matrix_op_mul2(struct matrix_op_s *mop)
{
    unsigned int i, j, k, l, n, m;
    struct matrix_dsc_s *mxap, *mxbp, *mxp;
    double *ap, *bp, *cp;

    mxap = mop->mpp[ mop->ia ];
    mxbp = mop->mpp[ mop->ib ];

    k = mxap->num_cols;
    if (  k == mxbp->num_rows ) {
        n = mxap->num_rows;
        m = mxbp->num_cols;
        
        // Expand the pointer array storage to accommodate 
        // the new (struct matrix_dsc_s *)
        mop->mpp = realloc( mop->mpp, (mop->num_matrixes + 1) * sizeof(struct matrix_dsc_s *) );

        // --- create the new matrix to hold the result ---
        mop->mpp[ mop->num_matrixes++ ] = mxp =
          calloc(1, sizeof(struct matrix_dsc_s));
        mxp->id = strdup("matrix mul 1 x 2 ");
        mxp->num_rows = n;
        mxp->num_cols = m;
        mxp->rowp = malloc( n * sizeof(double *) );

        for ( i = 0; i < n; i++ ) {
            mxp->rowp[i] = cp = calloc(m, sizeof(double) );
            ap = mxap->rowp[i]; 

            for ( j = 0; j < m; j++ ) {

                for ( l = 0; l < k; l++ ) {
                    bp = mxbp->rowp[l]; 
                    /**  This is actually a(i, j) x b(j, l) **/
                    cp[j] += ap[l] * bp[j];
                    /** and, store the result c(i, j) **/
                }
            }
        }

    } else {
        sprintf(dbg, "Fatal error: requested matrix op multiplication, but matrix A column number (%d) is not equal to matrix B row number (%d)...", k, mxbp->num_rows );   DBG(dbg); 
    }
    
}

/** 
 * Tell whether two matrixes have the same shape (same number of rows and
 * same number of columns), as required for addition/subtraction and other
 * element-wise operations.
 *
 * Returns 1 when the shapes agree, 0 otherwise.
 **/
unsigned int matrix_same_addsub(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp)
{
    if ( MXROWS(mxap) != MXROWS(mxbp) )
        return 0;
    if ( MXCOLS(mxap) != MXCOLS(mxbp) )
        return 0;

    return 1;
}

/**
 *
 * Check if the two matrices have the same element values.
 * Matrices whose numbers of rows or columns differ are not equal by
 * definition.  Elements are compared exactly (no tolerance).
 *
 * Returns 1 when equal, 0 otherwise.
 *
 **/
unsigned int matrix_equal(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp)
{
    unsigned int i, j, m, n;
    double **rpp, *rp, **r2pp, *r2p;
    
    // Both dimensions must agree (the row test used to be applied twice,
    // so the column counts were never compared).
    if ( !( MXROWS_EQ(mxap, mxbp) && MXCOLS_EQ(mxap, mxbp) ) )
        return 0;

    m = MXROWS(mxap); 
    n = MXCOLS(mxap);
    rpp = MXRPP(mxap);
    r2pp = MXRPP(mxbp);
    for ( i = 0; i < m; i++ ) {
        rp = rpp[i];
        r2p = r2pp[i];
        for ( j = 0; j < n; j++ ) {
            if ( rp[j] != r2p[j] )
                return 0;       // first difference settles it
        }
    }

    return 1;
}

/*
 * Tell whether (ri, ci) is a valid zero-based row/column index pair for
 * mxp.  Returns 1 when both indexes are in range, 0 otherwise.
 */
unsigned int matrix_rc_index_valid( struct matrix_dsc_s *mxp, unsigned int ri, unsigned int ci)
{
    if ( ri >= MXROWS(mxp) )
        return 0;

    return ( ci < MXCOLS(mxp) ) ? 1 : 0;
}

/*
 * matrix addition in place:  a = a + alpha * b
 *
 * Both matrixes must have the same shape; otherwise MXNOP() is invoked
 * and a is left unchanged.
 */
void matrix_addition(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp, double alpha)
{
    unsigned int r, c, rows, cols;
    double *dst, *src;

    if ( !matrix_same_addsub(mxap, mxbp) ) {
        MXNOP(mxap, mxbp);
        return;
    }

    rows = mxap->num_rows;
    cols = mxap->num_cols;
    for ( r = 0; r < rows; r++ ) {
        dst = mxap->rowp[r]; 
        src = mxbp->rowp[r]; 
        for ( c = 0; c < cols; c++ )
            dst[c] += alpha * src[c];
    }
}


/*
 * matrix subtraction in place:  a = a - alpha * b
 *
 * Both matrixes must have the same shape; otherwise MXNOP() is invoked
 * and a is left unchanged.
 */
void matrix_subtract(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp, double alpha)
{
    unsigned int r, c;
    unsigned int rows = mxap->num_rows;
    unsigned int cols = mxap->num_cols;
    double *dst, *src;

    if ( !matrix_same_addsub(mxap, mxbp) ) {
        MXNOP(mxap, mxbp);
        return;
    }

    for ( r = 0; r < rows; r++ ) {
        dst = mxap->rowp[r]; 
        src = mxbp->rowp[r]; 
        for ( c = 0; c < cols; c++ )
            dst[c] -= alpha * src[c];
    }
}


/*
 * matrix addition into a third matrix:  c = a + alpha * b
 *
 * a and b must have the same shape; otherwise MXNOP() is invoked and c is
 * left unchanged.  The shape of c is not checked here: it must provide at
 * least as many rows and columns as a.
 */
void matrix_addition3(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp, struct matrix_dsc_s *mxcp, double alpha)
{
    unsigned int r, c, rows, cols;
    double *lhs, *rhs, *out;

    if ( !matrix_same_addsub(mxap, mxbp) ) {
        MXNOP(mxap, mxbp);
        return;
    }

    rows = mxap->num_rows;
    cols = mxap->num_cols;
    for ( r = 0; r < rows; r++ ) {
        lhs = mxap->rowp[r]; 
        rhs = mxbp->rowp[r]; 
        out = mxcp->rowp[r]; 
        for ( c = 0; c < cols; c++ )
            out[c] = lhs[c] + alpha * rhs[c];
    }
}


/* matrix subtraction:  c = a - alpha * b   */
void matrix_subtract3(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp, struct matrix_dsc_s *mxcp, double alpha)
{
    unsigned int i, j, l, n, m;
    double *ap, *bp, *cp;

    if ( matrix_same_addsub(mxap, mxbp) ) {
        m = mxap->num_rows;
        n = mxap->num_cols;
        for ( i = 0; i < m; i++ ) {
            ap = mxap->rowp[i]; 
            bp = mxbp->rowp[i]; 
            cp = mxcp->rowp[i]; 
            for ( j = 0; j < n; j++ ) {
                cp[j] = ap[j] - alpha * bp[j];
            }
        }

    } else {
        MXNOP(mxap, mxbp);
    }

}

/*
 *  Sum all rows of a matrix column by column.
 *
 *    mxp:  the matrix whose rows are being summed up; only read.
 *   smxp:  receives the result in its row 0.  It is passed to
 *          matrix_clear() first and then accumulates one addend per
 *          row of mxp.
 *
 *  Nothing is done, and nothing is reported, when the two matrices do not
 *  have the same number of columns.
 */
void  matrix_rows_sum(struct matrix_dsc_s *mxp, struct matrix_dsc_s  *smxp)
{
    unsigned int r, c, nrows, ncols, cols_match;
    double **src_rows, **sum_rows, *src, *sum;

    cols_match = MXCOLS_EQ(mxp, smxp);
    if ( cols_match ) {
        nrows    = MXROWS(mxp);
        ncols    = MXCOLS(mxp);
        src_rows = MXRPP(mxp);
        sum_rows = MXRPP(smxp);

        matrix_clear(smxp);
        sum = sum_rows[0];

        for ( r = 0; r < nrows; r++ ) {
            src = src_rows[r];
            for ( c = 0; c < ncols; c++ )
                sum[c] += src[c];
        }
    }
}

/*
 *  Column-wise average of the rows of mxp.
 *
 *  The rows are first summed into amxp with matrix_rows_sum(); amxp is then
 *  scaled by 1 / (number of rows of mxp).
 */
void  matrix_rows_avg(struct matrix_dsc_s *mxp, struct matrix_dsc_s *amxp)
{
	double row_count;

	row_count = (double)( MXROWS(mxp) );
	matrix_rows_sum(mxp, amxp);
	matrix_scale(amxp, 1./row_count);
}

/*
 *  Build the matrix of deviations of every row from the column-wise average:
 *
 *      d[i][j] = mxp[i][j] - avg[j]
 *
 *    mxp:   input matrix (m rows, n columns); only read.
 *    aamxp: optional holder for the average row.  When non-NULL it must have
 *           as many columns as mxp, and it is overwritten with the row
 *           average of mxp by matrix_rows_avg().  When NULL, a temporary
 *           1 x n matrix is created for the average and released again.
 *
 *  Returns a newly created m x n matrix, or NULL when aamxp is given but its
 *  column count differs from that of mxp.
 */
struct matrix_dsc_s *matrix_delta(struct matrix_dsc_s *mxp, struct matrix_dsc_s *aamxp)
{
	unsigned int i, j, m, n, mc;
	double **drpp, *drp, **rpp, *rp, **arpp, *arp;
	struct matrix_dsc_s *dmxp = NULL, *amxp;	/* NULL is the error result */

	mc = 1;
	if ( aamxp )
		mc = MXCOLS_EQ(mxp, aamxp);

	if ( !mc ) {
		DBG("error of amxp # of columns");
		return dmxp;
	}

	m = MXROWS(mxp);    rpp = MXRPP(mxp);
	n = MXCOLS(mxp);

	/* Use the caller's average row if there is one, else a temporary. */
	if ( aamxp )  {
		amxp = aamxp;
	} else {
		amxp = matrix_create("averge of rows", 1, n);
	}
	arpp = MXRPP(amxp);

	dmxp = matrix_create("variance", m, n);
	drpp = MXRPP(dmxp);  arp = arpp[0];

	matrix_rows_avg(mxp, amxp);

	for ( i = 0; i < m; i++ ) {
		 rp = rpp[i];
		drp = drpp[i];
		for ( j = 0; j < n; j++ ) {
			drp[j] = rp[j] - arp[j];
		}
	}

	DBG("average of rows is: ");
	matrix_print(amxp);
	if ( !aamxp )
		matrix_dsc_fini(amxp);	/* only the temporary is ours to release */

	return dmxp;
}

/*
 *  Compute the covariance matrix of mxp.
 *
 *  The rows of mxp are first turned into deviations from the column-wise
 *  average (matrix_delta()); the result is matrix_mul_aa() of that
 *  deviation matrix with MXMUL_TYPE_ATA, scaled by 1 / (m - 1) where m is
 *  the number of rows of mxp.
 *
 *  Returns the matrix produced by matrix_mul_aa().  The intermediate
 *  deviation matrix is released before returning.
 *
 *  NOTE(review): m < 2 is not rejected; the scale factor is not finite
 *  for m == 1.
 */
struct matrix_dsc_s *matrix_covar(struct matrix_dsc_s *mxp)
{
	unsigned int m;
	double s;
	struct matrix_dsc_s *deltamxp, *covmxp;

	m = MXROWS(mxp);
	s = 1.0 / (double) (m - 1);

	deltamxp = matrix_delta(mxp, NULL);
	covmxp = matrix_mul_aa(deltamxp, MXMUL_TYPE_ATA);

	/* The deviation matrix was created by matrix_delta() only for this
	 * product; release it so it does not leak. */
	matrix_dsc_fini(deltamxp);

	matrix_scale(covmxp, s);

	return covmxp;
}

/*
 *  Compute the covariance matrix with respect to an average row-vector.
 *  The row-vector containing the average values is still stored in a matrix_dsc_s {}.
 *
 */
struct matrix_dsc_s *matrix_covar_wrt_amxp(struct matrix_dsc_s *mxp, struct matrix_dsc_s *amxp)
{
	unsigned int m, n;
	double s;
	struct matrix_dsc_s *deltamxp, *covmxp;
	
	m = MXROWS(mxp);   s = 1.0 / (double) (m - 1);
	n = MXCOLS(mxp);
	
	deltamxp = matrix_delta(mxp, amxp);
	covmxp = matrix_mul_aa(deltamxp, MXMUL_TYPE_ATA);
	
	matrix_scale(covmxp, s);

	return covmxp;
}

/* matrix scale:  a = beta * a,  beta is a scalar.    */
void matrix_scale(struct matrix_dsc_s *mxp, double beta)
{
    unsigned int i, j, l, n, m;
    double *ap, *bp;

    m = MXROWS(mxp);
    n = MXCOLS(mxp);
    if ( beta != 1.0 ) {
        for ( i = 0; i < m; i++ ) {
            ap = mxp->rowp[i]; 
            for ( j = 0; j < n; j++ ) {
                ap[j] *= beta;
            }
        }
    }
}

/**
 *
 * Dot product of a vector with itself (sum of its squared elements),
 * computed directly so that no transpose has to be formed.
 *
 * The matrix must be a row vector (1 x n) or a column vector (m x 1).
 * For any other shape a debug message is emitted and 0.0 is returned.
 *
 **/
double matrix_dotproduct(struct matrix_dsc_s *mxp) 
{
    unsigned int k;
    const unsigned int nrows = mxp->num_rows;
    const unsigned int ncols = mxp->num_cols;
    double **rows = mxp->rowp;
    double *row, acc = 0.0;

    if ( nrows == 1 ) {
        /* Row vector: walk along the single row. */
        row = rows[0];
        for ( k = 0; k < ncols; k++ )
            acc += row[k] * row[k];
    } else if ( ncols == 1 ) {
        /* Column vector: first element of every row. */
        for ( k = 0; k < nrows; k++ ) {
            row = rows[k];
            acc += row[0] * row[0];
        }
    } else {
        DBG("This is neither a column nor a row vector. Nothing as done.");
    }

    return acc;
}

/*
 * Dot product of two vectors of identical shape.
 *
 * Both arguments must have the same dimensions and be either row vectors
 * (1 x n) or column vectors (m x 1).  When the shapes differ, or the
 * operands are not vectors, a debug message is emitted and 0.0 is returned.
 */
double matrix_dotproduct_2colvec(struct matrix_dsc_s *mxp, struct matrix_dsc_s *mx2p) 
{
    unsigned int k;
    const unsigned int nrows = mxp->num_rows;
    const unsigned int ncols = mxp->num_cols;
    double **rows_a = mxp->rowp;
    double **rows_b = mx2p->rowp;
    double *a, *b, acc = 0.0;

    if ( (nrows != mx2p->num_rows) || (ncols != mx2p->num_cols) ) {
        sprintf(dbg, "These two vectors are not of the same dims: %s -- %s",
            mxp->id, mx2p->id); DBG(dbg); 
        return acc;
    }

    if ( nrows == 1 ) {
        /* Row vectors: pair up the elements of the single rows. */
        a = rows_a[0];
        b = rows_b[0];
        for ( k = 0; k < ncols; k++ )
            acc += a[k] * b[k];
    } else if ( ncols == 1 ) {
        /* Column vectors: pair up the first element of every row. */
        for ( k = 0; k < nrows; k++ ) {
            a = rows_a[k];
            b = rows_b[k];
            acc += a[0] * b[0];
        }
    } else {
        DBG("This is neither a column nor a row vector. Nothing as done.");
    }

    return acc;
}    

/********************************************/

/* matrix addition:  A = A + b   */
void matrix_op_add(struct matrix_op_s *mop)
{
    unsigned int i, j, k, l, n, m;
    struct matrix_dsc_s *mxap, *mxbp;
    double *ap, *bp;

    mxap = mop->mpp[ mop->ia ];
    mxbp = mop->mpp[ mop->ib ];

    if ( matrix_same_addsub(mxap, mxbp) ) {
        m = mxap->num_rows;
        n = mxap->num_cols;
        for ( i = 0; i < m; i++ ) {
            ap = mxap->rowp[i]; 
            bp = mxbp->rowp[i]; 
            for ( j = 0; j < n; j++ ) {
                ap[j] += bp[j];
            }
        }

    } else {
        MXNOP(mxap, mxbp);
    }

}

/* matrix addition:  A = A + alpha * b   */
void matrix_add_scale(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp, double alpha)
{
    unsigned int i, j, k, l, n, m;
    double *ap, *bp;

    m = MXROWS(mxap);
    n = MXCOLS(mxbp);

    if ( matrix_same_addsub(mxap, mxbp) ) {
        for ( i = 0; i < m; i++ ) {
            ap = mxap->rowp[i]; 
            bp = mxbp->rowp[i]; 
            for ( j = 0; j < n; j++ ) {
                ap[j] += alpha * bp[j];
            }
        }

    } else {
        MXNOP(mxap, mxbp);
    }
}



/*
 *  Matrix element-wise multiplication ( the same as A = A.*B in Octave)
 *  (similar to a matrix addition, but is is multiplication)
 *
 *  A  =  A * B    
 */
void matrix_ewop_mul(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp, double alpha)
{
    unsigned int i, j, k, l, n, m;
    double *ap, *bp;


    if ( matrix_same_addsub(mxap, mxbp) ) {
        m = MXROWS(mxap);
        n = MXCOLS(mxbp);
        for ( i = 0; i < m; i++ ) {
            ap = mxap->rowp[i]; 
            bp = mxbp->rowp[i]; 
            for ( j = 0; j < n; j++ ) {
                ap[j] *= alpha * bp[j];
            }
        }

    } else {
        MXNOP(mxap, mxbp);
    }
}

/*
 *  Element-wise sum: add up every element of the matrix, row by row,
 *  and return the total.
 */
double matrix_ewop_sum(struct matrix_dsc_s *mxp) 
{
	unsigned int r, c;
	const unsigned int nrows = MXROWS(mxp);
	const unsigned int ncols = MXCOLS(mxp);
	double **rows = MXRPP(mxp);
	double *row, total = 0;

	r = 0;
	while ( r < nrows ) {
		row = rows[r];
		for ( c = 0; c < ncols; c++ )
			total += row[c];
		r++;
	}

	return total;
}

/*
 *  Element-wise sum of squares: square every element of the matrix, add
 *  the squares up row by row and return the total.  No square root is
 *  taken, despite the function name.
 */
double matrix_ewop_sqrtsum(struct matrix_dsc_s *mxp)
{
    unsigned int r, c;
    double **rows = MXRPP(mxp);
    const unsigned int nrows = MXROWS(mxp);
    const unsigned int ncols = MXCOLS(mxp);
    double *row, total = 0.0;

    r = 0;
    while ( r < nrows ) {
        row = rows[r];
        for ( c = 0; c < ncols; c++ )
            total += row[c] * row[c];
        r++;
    }

    return total;
}



/**------------- matrix subtraction:  A = A - alpha * B  -------------*/
/*
 *   mxap:  matrix A, updated in place.
 *   mxbp:  matrix B, only read.
 *   alpha: scalar applied to every element of B before it is subtracted.
 *
 * Both dimensions are taken from A and compared with those of B.  On a
 * mismatch a debug message with both sizes is emitted and A is left
 * untouched.
 */
void matrix_sub_scale(struct matrix_dsc_s *mxap, struct matrix_dsc_s *mxbp, double alpha)
{
    unsigned int i, j, n, m;
    double *ap, *bp;

    m = MXROWS(mxap);
    /* Take the column count from A as well; taking it from B would make
     * the column comparison below trivially true. */
    n = MXCOLS(mxap);

    if (  (m == MXROWS(mxbp)) && (n == MXCOLS(mxbp)) ) {
        for ( i = 0; i < m; i++ ) {
            ap = mxap->rowp[i]; 
            bp = mxbp->rowp[i]; 
            for ( j = 0; j < n; j++ ) {
                ap[j] -= alpha * bp[j];
            }
        }

    } else {
        sprintf(dbg, " number of matrices rows and cols does not match cannot operator on them = A (%d, %d), B(%d, %d)",
            m, n, MXROWS(mxbp), MXCOLS(mxbp) ); DBG(dbg); 
    }
}

/*
 * In-place matrix subtraction on an operation descriptor:  A = A - B
 *
 * A is mop->mpp[mop->ia] and B is mop->mpp[mop->ib].  When the two
 * matrices differ in row or column count, a debug message with both sizes
 * is emitted and A is left untouched.
 */
void matrix_op_sub(struct matrix_op_s *mop)
{
    unsigned int r, c, nrows, ncols;
    struct matrix_dsc_s *mxap = mop->mpp[ mop->ia ];
    struct matrix_dsc_s *mxbp = mop->mpp[ mop->ib ];
    double *dst, *src;

    nrows = mxap->num_rows;
    ncols = mxap->num_cols;

    if ( nrows != mxbp->num_rows || ncols != mxbp->num_cols ) {
        sprintf(dbg, " number of matrices rows and cols does not match cannot operator on them = A (%d, %d), B(%d, %d)",
            nrows, ncols, mxbp->num_rows, mxbp->num_cols ); DBG(dbg); 
        return;
    }

    for ( r = 0; r < nrows; r++ ) {
        dst = mxap->rowp[r];
        src = mxbp->rowp[r];
        for ( c = 0; c < ncols; c++ )
            dst[c] -= src[c];
    }
}

/*
 *   Compare the elements of matrix A (mop->mpp[mop->ia]) and matrix B
 *   (mop->mpp[mop->ib]).
 *   Returns 1 if both have the same dimensions and all corresponding
 *   elements compare equal with ==, 0 otherwise.
 */
unsigned int matrix_op_cmp(struct matrix_op_s *mop)
{
    unsigned int r, c, nrows, ncols;
    struct matrix_dsc_s *mxap = mop->mpp[ mop->ia ];
    struct matrix_dsc_s *mxbp = mop->mpp[ mop->ib ];
    double *arow, *brow;

    nrows = mxap->num_rows;
    ncols = mxap->num_cols;

    /* Different shapes can never be identical. */
    if ( ( nrows != mxbp->num_rows) || ( ncols != mxbp->num_cols) )
        return 0;

    for ( r = 0; r < nrows; r++ ) {
        arow = mxap->rowp[r];
        brow = mxbp->rowp[r];
        for ( c = 0; c < ncols; c++ ) {
            if ( arow[c] != brow[c] )
                return 0;
        }
    }

    return 1;
}

/*
 *   Duplicate matrix A (mop->mpp[mop->ia]).
 *
 *   A new matrix with the same dimensions is created, the element values
 *   are copied, and the copy is registered with matrix_add_mxp(); the value
 *   returned by matrix_add_mxp() is stored in mop->idup.
 *
 *   NOTE(review): each row copy covers num_cols + 1 elements (the augmented
 *   column); this presumes that every row owns one element beyond
 *   num_cols -- confirm against matrix_create().
 */
void matrix_op_dup(struct matrix_op_s *mop)
{
    unsigned int i, j, n, m;
    struct matrix_dsc_s *mxap, *mxbp;
    double *ap, *bp; 
    char buf[256];

    mxap = mop->mpp[ mop->ia ];
    /* Bounded formatting: a long source id must not overflow buf. */
    snprintf(buf, sizeof buf, "duplicate of matrix %s", mxap->id); 
    n = mxap->num_rows;
    m = mxap->num_cols;

    mxbp = matrix_create(buf, n, m);    
    for ( i = 0; i < n; i++ ) {
        ap = mxap->rowp[i];
        bp = mxbp->rowp[i]; 
        for ( j = 0; j < m+1; j++ ) { // Including the augmented column too
            bp[j] = ap[j];
        }
    }

    mop->idup = matrix_add_mxp(mop, mxbp);
}


/*
 *   Create and return a copy of matrix mxap.
 *
 *   The copy has the same dimensions, and its id is
 *   "duplicate of matrix <id of mxap>" (truncated to fit a 256-byte buffer).
 *
 *   NOTE(review): each row copy covers num_cols + 1 elements (the augmented
 *   column); this presumes that every row owns one element beyond
 *   num_cols -- confirm against matrix_create().
 */
struct matrix_dsc_s *matrix_dup(struct matrix_dsc_s *mxap)
{
    unsigned int i, j, n, m;
    struct matrix_dsc_s *mxbp;
    double *ap, *bp; 
    char buf[256];

    n = MXROWS(mxap);   m = MXCOLS(mxap);
    
    /* Bounded formatting: a long source id must not overflow buf. */
    snprintf(buf, sizeof buf, "duplicate of matrix %s", mxap->id); 
    mxbp = matrix_create(buf, n, m);

    for ( i = 0; i < n; i++ ) {
        ap = mxap->rowp[i];
        bp = mxbp->rowp[i];
        for ( j = 0; j < m+1; j++ ) { // Including the augmented column too
            bp[j] = ap[j];
        }
    }
    
    return mxbp;
}

/*
 *   Create and return a copy of matrix mxap whose id is newid.
 *
 *   NOTE(review): each row copy covers num_cols + 1 elements (the augmented
 *   column); this presumes that every row owns one element beyond
 *   num_cols -- confirm against matrix_create().
 */
struct matrix_dsc_s *matrix_dup2(unsigned char *newid, struct matrix_dsc_s *mxap)
{
    unsigned int r, c;
    const unsigned int nrows = MXROWS(mxap);
    const unsigned int ncols = MXCOLS(mxap);
    struct matrix_dsc_s *copy;
    double *src, *dst;

    copy = matrix_create(newid, nrows, ncols);

    r = 0;
    while ( r < nrows ) {
        src = mxap->rowp[r];
        dst = copy->rowp[r];
        /* <= ncols, so the augmented column is copied as well */
        for ( c = 0; c < ncols+1; c++ )
            dst[c] = src[c];
        r++;
    }

    return copy;
}

/*
 * Create a new matrix named newid with the same dimensions as mxap.
 * No element values are copied from mxap.
 */
struct matrix_dsc_s *matrix_dup_dims(unsigned char *newid, struct matrix_dsc_s *mxap)
{
    const unsigned int nrows = MXROWS(mxap);
    const unsigned int ncols = MXCOLS(mxap);

    return matrix_create(newid, nrows, ncols);
}


/**
 * Copy the element values of matrix A (maxp) into matrix B (mbxp).
 *
 * Both matrices must have identical dimensions.  If they do not, two debug
 * messages are emitted and B is left untouched.
 **/
void matrix_value_copy(struct matrix_dsc_s *maxp, struct matrix_dsc_s *mbxp)
{
    unsigned int r, c;
    const unsigned int nrows = maxp->num_rows;
    const unsigned int ncols = maxp->num_cols;
    double **src_rows, **dst_rows, *src, *dst;

    if ( (nrows != mbxp->num_rows) || (ncols != mbxp->num_cols) ) {
        sprintf(dbg, "The two matrices are of differnt size = %s (%d x %d)  %s (%d x %d)",
        maxp->id, nrows, ncols, mbxp->id, mbxp->num_rows, mbxp->num_cols); 
        DBG(dbg); 
        DBG("Sorry, nothing could be done. ");
        return;
    }

    src_rows = maxp->rowp;
    dst_rows = mbxp->rowp;
    for ( r = 0; r < nrows; r++ ) {
        src = src_rows[r];
        dst = dst_rows[r];
        for ( c = 0; c < ncols; c++ )
            dst[c] = src[c];
    }
}

/*
 * Augment matrix A (mop->mpp[mop->ia]) with matrix b (mop->mpp[mop->ib]).
 *
 * A's aug flag is set, and for every row i of A the first element of b's
 * row i is stored at column index num_cols of A's row i, i.e. one position
 * past the regular columns.
 * NOTE(review): this presumes every row of A owns that extra element and
 * that b has at least as many rows as A -- neither is checked here.
 */
void matrix_op_aug(struct matrix_op_s *mop)
{
    unsigned int r, nrows, aug_col;
    struct matrix_dsc_s *mxap = mop->mpp[ mop->ia ];
    struct matrix_dsc_s *mxbp = mop->mpp[ mop->ib ];

    nrows   = mxap->num_rows;
    aug_col = mxap->num_cols;
    mxap->aug = 1;

    for ( r = 0; r < nrows; r++ )
        mxap->rowp[r][aug_col] = mxbp->rowp[r][0];
}

/*
 * qsort() comparator: order rows ascending by column 0 (the first column).
 *
 * Each argument points to a row pointer (double *).  Returns 1, -1 or 0
 * depending on whether the first row's key is greater than, less than or
 * neither greater nor less than the second row's key.
 */
int matrix_cmp_rows(const void *spp1, const void *spp2)
{
    double *row1 = *(double**)spp1;
    double *row2 = *(double**)spp2;
    double diff  = row1[0] - row2[0];

    return (diff > 0) - (diff < 0);
}


/*
 * qsort() comparator: order rows ascending by column 1 (the second column).
 *
 * Each argument points to a row pointer (double *).  Returns 1, -1 or 0
 * depending on whether the first row's key is greater than, less than or
 * neither greater nor less than the second row's key.
 */
int matrix_cmp_rows_c1(const void *spp1, const void *spp2)
{
    const int key = 1;
    double *row1 = *(double**)spp1;
    double *row2 = *(double**)spp2;
    double diff  = row1[key] - row2[key];

    return (diff > 0) - (diff < 0);
}


/*
 * qsort() comparator: order rows ascending by column 2 (the third column).
 *
 * Each argument points to a row pointer (double *).  Returns 1, -1 or 0
 * depending on whether the first row's key is greater than, less than or
 * neither greater nor less than the second row's key.
 */
int matrix_cmp_rows_c2(const void *spp1, const void *spp2)
{
    const int key = 2;
    double *row1 = *(double**)spp1;
    double *row2 = *(double**)spp2;
    double diff  = row1[key] - row2[key];

    return (diff > 0) - (diff < 0);
}

/*
 * qsort() comparator: order rows ascending by column 3 (the fourth column).
 *
 * Each argument points to a row pointer (double *).  Returns 1, -1 or 0
 * depending on whether the first row's key is greater than, less than or
 * neither greater nor less than the second row's key.
 */
int matrix_cmp_rows_c3(const void *spp1, const void *spp2)
{
    const int key = 3;
    double *row1 = *(double**)spp1;
    double *row2 = *(double**)spp2;
    double diff  = row1[key] - row2[key];

    return (diff > 0) - (diff < 0);
}

/*
 * Sort the rows of matrix A (mop->mpp[mop->ia]) in ascending order of
 * their first column.  The array of row pointers is sorted with qsort()
 * and matrix_cmp_rows().
 */
void matrix_op_sort_A(struct matrix_op_s *mop)
{
    struct matrix_dsc_s *amxp = mop->mpp[ mop->ia];
    unsigned int nrows = amxp->num_rows;

    qsort(amxp->rowp, nrows, sizeof(double *), matrix_cmp_rows);
}

/**
 * Sort the rows of a matrix in ascending order of column ci.
 *
 *   mxp: matrix whose array of row pointers is sorted with qsort().
 *   ci:  key column; comparators exist for columns 0 to 3 only.
 *
 * For any other ci a debug message is emitted and the matrix is left
 * untouched.
 * NOTE(review): ci is not checked against the column count of mxp.
 **/
void matrix_sort_mxp(struct matrix_dsc_s *mxp, unsigned int ci)
{
    unsigned int n;
	int (*matrix_sortfunc_cmp)(const void *, const void *) = NULL;

	switch(ci) {
	case 0:  matrix_sortfunc_cmp = matrix_cmp_rows;    break;
	case 1:  matrix_sortfunc_cmp = matrix_cmp_rows_c1; break;
	case 2:  matrix_sortfunc_cmp = matrix_cmp_rows_c2; break;
	case 3:  matrix_sortfunc_cmp = matrix_cmp_rows_c3; break;

	default:
		/* No comparator for this column: never hand qsort() an unset
		 * function pointer. */
		sprintf(dbg,
			"need to define function: int matrix_cmp_rows_c3\"%d\"() ", ci);
		DBG(dbg);
		return ;
	}

    n = MXROWS(mxp);
    qsort(mxp->rowp, n, sizeof(double *), matrix_sortfunc_cmp);
}

/*
 *   Gaussian (Jordan) Elimination
 *
 *   Forward elimination on matrix A (mop->mpp[mop->ia]), carried out in
 *   place.  For every pivot row i, the rows below it get a multiple of row
 *   i subtracted, covering columns i up to and including column index
 *   num_cols (the augmented column).
 *
 *   The pivot row is obtained from gje_sub_get_ap(), which may swap rows
 *   and may set A->not_li; elimination stops as soon as not_li is set.
 *   After a complete pass, not_li is also set when element num_cols-1 of
 *   the last row is exactly 0.0.
 */
void matrix_op_gje(struct matrix_op_s *mop)
{
    unsigned int i, j, k, n, m; 
    struct matrix_dsc_s *mxap;
    double *ap, *bp, mu;

    mxap = mop->mpp[ mop->ia ];

    n = mxap->num_rows;
    m = mxap->num_cols;

    /* Nothing to eliminate; also keeps n-1 and m-1 from wrapping around. */
    if ( n == 0 || m == 0 )
        return;

    for ( i = 0; i < n-1; i++ ) {
        ap = gje_sub_get_ap(mxap, i);
        if ( mxap->not_li ) 
            break;
        for ( j = i+1; j < n ; j++ ) {  // Need to deal with the augmented column too
            bp = mxap->rowp[j];
            
            if ( bp[i] == 0.0 ) 
                continue;
            
            mu = bp[i] / ap[i];
            for ( k = i; k < m+1; k++ ) {
                bp[k] -= ap[k] * mu; 
            }
        }
    }

    if ( !mxap->not_li ) {
        /* Look at the last row explicitly: with a single row the loops
         * above never run, so no row pointer would have been picked up. */
        bp = mxap->rowp[n-1];
        if ( bp[m-1] == 0.0 ) {
            mxap->not_li = 1;   // This matrix A is not linearly independent.
        }
    }
}

/**
 * Fetch the pivot row for elimination step i.
 *
 * The pivot element (column i of row i) must not be 0.  If it is, the rows
 * below row i are scanned and every row with a nonzero element in column i
 * is swapped into position i with matrix_swrow().
 * NOTE(review): the scan does not stop at the first hit, so several swaps
 * may take place.
 *
 * Returns the row chosen as pivot row.  If its element in column i is
 * still 0.0, mxp->not_li is set to 1.
 **/
double *gje_sub_get_ap(struct matrix_dsc_s *mxp, unsigned int i)
{
    unsigned int below, nrows;
    double *pivot_row, *candidate;

    pivot_row = mxp->rowp[i];

    if ( pivot_row[i] == 0.0 ) {
        nrows = mxp->num_rows;
        below = i+1;
        while ( below < nrows ) {
            candidate = mxp->rowp[below];
            if ( candidate[i] != 0.0 ) {
                pivot_row = candidate;
                matrix_swrow(mxp, i, below);  // switch row i and this row.
            }
            below++;
        }
    }

    /* No usable pivot anywhere in this column. */
    if ( pivot_row[i] == 0.0 )
        mxp->not_li = 1;

    return pivot_row;
}

/*
 * Fetch row i of matrix mxp and row i of the companion matrix mxlp for an
 * LU decomposition step.
 *
 *   app: receives the selected row of mxp.
 *   ipp: receives the matching row of mxlp.
 *
 * The pivot element (column i of row i of mxp) must not be 0.  If it is,
 * the rows below row i are scanned; for every row of mxp with a nonzero
 * element in column i, that row is swapped with row i in both matrices
 * (matrix_swrow()) and the two output pointers are updated.
 */
void lud_sub_get_ap_ip(struct matrix_dsc_s *mxp, struct matrix_dsc_s *mxlp, unsigned int i, double **app, double **ipp)
{
    unsigned int j, n;
    double *ap, *aap;
    
    *app = mxp->rowp[i];
    *ipp = mxlp->rowp[i];

    // The the pivot ap[i] cannot be 0.
    // app points at a single row pointer owned by the caller, so it has to
    // be dereferenced; indexing it with i would read past that pointer.
    ap = *app;
    if ( ap[i] == 0.0 ) {
        n = mxp->num_rows;
        for ( j = i+1; j < n; j++ ) {
            aap = mxp->rowp[j];
            if ( aap[i] != 0.0 ) {
                *app = aap;
                *ipp = mxlp->rowp[j];
                matrix_swrow(mxp, i, j);  // switch row i and row j.
                matrix_swrow(mxlp, i, j);
            }
        }
    }
}

/*
 * Back substitution on the augmented matrix A (mop->mpp[mop->ia]).
 *
 * Rows are processed from the last one up to the first.  For row i the
 * already solved part is summed up by matrix_bt_get_rs(); the diagonal
 * element is then set to 1.0 and the augmented element (column index
 * num_cols) is replaced by (augmented - sum) / old diagonal value.
 */
void matrix_op_bt(struct matrix_op_s *mop)
{
    int row, nrows, aug_col;
    struct matrix_dsc_s *amxp = mop->mpp[ mop->ia ];
    double *cur, diag, known;

    nrows   = amxp->num_rows;
    aug_col = amxp->num_cols;

    row = nrows;
    while ( --row >= 0 ) {
        cur  = amxp->rowp[row];
        diag = cur[row];

        known = matrix_bt_get_rs(mop, row);
        cur[row]     = 1.0;
        cur[aug_col] = (cur[aug_col] - known )/ diag;
    }
}

/*
 *  LU forward substitution helper for the y matrix.
 *
 *  Sums coefficient * y[i] over the columns i < idx of row idx, where the
 *  coefficients come from mop->mpp[mop->ia] and the y values from the
 *  first column of mop->mpp[mop->iy].  The sum stops just before y[idx],
 *  which is the value being solved right now.
 */
double matrix_luft_get_rs(struct matrix_op_s *mop, unsigned int idx)
{
    unsigned int col;
    struct matrix_dsc_s *coef_mxp = mop->mpp[ mop->ia ];
    struct matrix_dsc_s *y_mxp    = mop->mpp[ mop->iy ];
    double *coef = coef_mxp->rowp[idx];
    double *yrow, sum = 0.0;

    col = 0;
    while ( col < idx ) {
        yrow = y_mxp->rowp[col];
        sum += coef[col] * yrow[0];
        col++;
    }

    return sum;
}

/*
 *  LU backward substitution helper for the x matrix (the solution).
 *
 *  Sums coefficient * x[i] over the columns i > idx of row idx, walking
 *  from the last row of x down to idx + 1.  The coefficients come from
 *  mop->mpp[mop->iu] and the x values from the first column of
 *  mop->mpp[mop->ix].  x[idx] itself is the value being solved right now.
 */
double matrix_lubt_get_rs(struct matrix_op_s *mop, unsigned int idx)
{
    unsigned int col, nrows;
    struct matrix_dsc_s *u_mxp = mop->mpp[ mop->iu ];
    struct matrix_dsc_s *x_mxp = mop->mpp[ mop->ix ];
    double *coef, *xrow, sum = 0.0;

    nrows = x_mxp->num_rows;
    coef  = u_mxp->rowp[idx];

    col = nrows-1;
    while ( col > idx ) {
        xrow = x_mxp->rowp[col];
        sum += coef[col] * xrow[0];
        col--;
    }

    return sum;
}

/*
 *  Back substitution helper for the augmented matrix A (mop->mpp[mop->ia]).
 *
 *  Sums A[idx][i] * A[i][num_cols] over the rows i > idx, walking from the
 *  last row down to idx + 1.  Column index num_cols is the augmented
 *  column.
 */
double matrix_bt_get_rs(struct matrix_op_s *mop, unsigned int idx)
{
    unsigned int row, nrows, aug_col;
    struct matrix_dsc_s *amxp = mop->mpp[ mop->ia ];
    double *coef, *solved, sum = 0.0;

    nrows   = amxp->num_rows;
    aug_col = amxp->num_cols;
    coef    = amxp->rowp[idx];

    row = nrows-1;
    while ( row > idx ) {
        solved = amxp->rowp[row];
        sum += coef[row] * solved[aug_col];
        row--;
    }

    return sum;
}

/* 
 *   Make sure the pivot element of row rn (the element in column rn) is
 *   nonzero.
 *
 *   If the pivot is 0.0, the rows below rn are searched for the first row
 *   with a nonzero element in that column; that row is swapped with row rn
 *   (matrix_swrow()) and a debug message reports the swap.
 *
 *   Returns 1 when row rn has a nonzero pivot on return, 0 when no row
 *   below it could supply one.
 */

unsigned int matrix_pivot_nonzero(struct matrix_dsc_s *mxp, unsigned int rn)
{
    unsigned int j, m, cn, done = 0;
    double *rp, *rp2, **rpp;
    
    rpp = mxp->rowp;
    m   = mxp->num_rows;    /* bound for the search over the rows below rn */
    rp  = rpp[rn];
    cn  = rn;
    if ( rp[cn] == 0.0 ) {
        for ( j = rn+1; j < m; j++ ) {
            rp2 = rpp[j];
            if ( rp2[cn] != 0.0 ) {
                sprintf(dbg, "matrix %s: switching %d %d",
                    mxp->id, rn, j); DBG(dbg); 
                matrix_swrow(mxp, rn, j);
                done = 1;
                break;
            }
        }
    } else
        done = 1;

    return done;
}

/* 
 *   Convert a matrix to row echelon form, in place.
 *
 *   For every row i except the last, a multiple of row i is subtracted from
 *   each row below it, covering columns i to num_cols-1.  The matrix is
 *   printed with matrix_print() before and after the conversion.
 *
 *   NOTE(review): the diagonal element is used as divisor without a check
 *   for zero and no rows are exchanged.
 */

void  matrix2row_echelon(struct matrix_dsc_s *mxp)
{
    unsigned int piv, below, col, nrows, ncols;
    double **rows, *piv_row, *cur;
    double factor;

    matrix_print(mxp);
    rows  = mxp->rowp;
    nrows = mxp->num_rows;
    ncols = mxp->num_cols;

    for ( piv = 0; piv < nrows-1; piv++ ) {
        piv_row = rows[piv];
        below = piv+1;
        while ( below < nrows ) {
            cur = rows[below];
            factor = cur[piv] / piv_row[piv];
            for ( col = piv; col < ncols; col++ )
                cur[col] -= piv_row[col] * factor;
            below++;
        }
    }
    matrix_print(mxp);
}
/*
 *   Solve the matrix equation  A * x = b  with the LU decomposition method.
 *
 *   Steps, in this order:
 *     1. matrix_op_dup()  - keep a backup copy of the original A.
 *     2. create an n x 1 solution matrix x (n = number of rows of A),
 *        register it with matrix_add_mxp() and record its index in mop->ix.
 *     3. matrix_op_lud()  - LU decomposition; afterwards L and U are
 *        pointed to by mop->mpp[mop->il] and mop->mpp[mop->iu].
 *     4. matrix_op_luft() - forward substitution.
 *     5. matrix_op_lubt() - backward substitution.
 */
void matrix_solver2(struct matrix_op_s *mop)
{
    unsigned int nrows;
    struct matrix_dsc_s *xmxp;

    matrix_op_dup(mop);

    nrows = mop->mpp[ mop->ia ]->num_rows;
    xmxp  = matrix_create("Matrix x (solution) ", nrows, 1);
    matrix_add_mxp(mop, xmxp);
    mop->ix = mop->num_matrixes - 1;

    matrix_op_lud(mop);
    matrix_op_luft(mop); 
    matrix_op_lubt(mop); 
}

/*
 *   The forward substitution after the LU decomposition; generates the
 *   y matrix.
 *
 *   A new n x 1 matrix "LU y matrix" is created, registered with
 *   matrix_add_mxp(), and its index recorded in mop->iy.  For every row i:
 *
 *       y[i] = ( b[i] - matrix_luft_get_rs(mop, i) ) / L[i][i]
 *
 *   with L = mop->mpp[mop->il] and b = mop->mpp[mop->ib].  While the rows
 *   are processed mop->ia temporarily holds mop->il, because
 *   matrix_luft_get_rs() reads its coefficients through mop->ia; the
 *   previous value is restored at the end.
 */
void matrix_op_luft(struct matrix_op_s *mop)
{
    struct matrix_dsc_s *lmxp = mop->mpp[ mop->il ];
    struct matrix_dsc_s *bmxp = mop->mpp[ mop->ib ];
    struct matrix_dsc_s *ymxp;
    unsigned int row, nrows, saved_ia;
    double known, diag, rhs, *brow, *yrow, *lrow;

    nrows = lmxp->num_rows;

    ymxp = matrix_create("LU y matrix", nrows, 1);
    matrix_add_mxp(mop, ymxp);
    mop->iy = mop->num_matrixes - 1;

    saved_ia = mop->ia;
    mop->ia  = mop->il;

    row = 0;
    while ( row < nrows ) {
        known = matrix_luft_get_rs(mop, row);
        brow  = bmxp->rowp[row];
        yrow  = ymxp->rowp[row];
        lrow  = lmxp->rowp[row];
        diag  = lrow[row];
        rhs   = brow[0] - known;
        yrow[0] = rhs / diag;
        row++;
    }

    mop->ia = saved_ia;
}

/*
 *   The backward substitution after the LU decomposition and the
 *   computation of the y matrix; generates the x matrix.
 *
 *   With U = mop->mpp[mop->iu], y = mop->mpp[mop->iy] and
 *   x = mop->mpp[mop->ix], the rows are processed from last to first:
 *
 *       x[i] = ( y[i] - matrix_lubt_get_rs(mop, i) ) / U[i][i]
 *
 *   U and y are printed first, and a debug line is emitted for every row.
 *   The solution is in x when the routine returns.
 */
void matrix_op_lubt(struct matrix_op_s *mop)
{
    struct matrix_dsc_s *umxp = mop->mpp[ mop->iu ]; // LU U matrix,   A in  A x = b
    struct matrix_dsc_s *ymxp = mop->mpp[ mop->iy ]; // Here b matrix  b in  A x = b
    struct matrix_dsc_s *xmxp = mop->mpp[ mop->ix ]; // Here x matrix  x in  A x = b
    int row, nrows, ncols;
    double diag, known, *yrow, *xrow, *urow;

    nrows = umxp->num_rows;
    ncols = umxp->num_cols;

    matrix_print(umxp);
    matrix_print(ymxp);
    sprintf(dbg, "n = %d m = %d i(u, y, x)=(%d, %d, %d)", nrows, ncols,
        mop->iu, mop->iy, mop->ix ); DBG(dbg); 

    row = nrows;
    while ( --row >= 0 ) {
        urow = umxp->rowp[row];
        yrow = ymxp->rowp[row];
        xrow = xmxp->rowp[row];

        diag  = urow[row];
        known = matrix_lubt_get_rs(mop, row);
        xrow[0] = ( yrow[0] - known ) / diag;
        sprintf(dbg, "i = %d, rs = %g bp[0] =%g xp[0]=%g mu=%g", 
            row, known, yrow[0], xrow[0], diag); DBG(dbg); 
    }
}

/*
 *   Solve the matrix equation  A * x = b  with the Gaussian elimination
 *   method.
 *
 *   A (mop->mpp[mop->ia]) is augmented with b, then eliminated.  Back
 *   substitution and generation of the solution matrix x are skipped when
 *   the elimination flagged A with not_li.
 */
void matrix_solver(struct matrix_op_s *mop)
{
    struct matrix_dsc_s *amxp = mop->mpp[mop->ia];

    matrix_op_aug(mop);        // Augment matrix A
    matrix_op_gje(mop);        // Gaussian Jordan Elimination 

    if ( amxp->not_li )
        return;

    matrix_op_bt(mop);         // Backtrace with substitution
    matrix_gen_sol(mop);       // Generate solution matrix x
}


/*
 * Extract the solution matrix x from the augmented matrix A
 * (mop->mpp[mop->ia]).
 *
 * When mop->new_xm is set, a new n x 1 matrix is created and registered
 * with matrix_add_mxp(), whose return value is stored in mop->ix;
 * otherwise the existing matrix mop->mpp[mop->ix] is used.  The augmented
 * column of A (column index num_cols) is then copied into the first column
 * of x, row by row.
 */
void matrix_gen_sol(struct matrix_op_s *mop)
{
    unsigned int row, nrows, aug_col;
    struct matrix_dsc_s *amxp = mop->mpp[ mop->ia ];
    struct matrix_dsc_s *xmxp;

    nrows   = amxp->num_rows;
    aug_col = amxp->num_cols;

    if ( !mop->new_xm ) {
        xmxp = mop->mpp[ mop->ix ];
    } else {
        xmxp = matrix_create("solution matrix x", nrows, 1);
        mop->ix = matrix_add_mxp(mop, xmxp);
    }

    for ( row = 0; row < nrows; row++ )
        xmxp->rowp[row][0] = amxp->rowp[row][aug_col];
}

/*
 *  Print the solution matrix x, i.e. mop->mpp[mop->ix], with
 *  matrix_print().
 */
void matrix_print_sol(struct matrix_op_s *mop)
{
    struct matrix_dsc_s *xmxp = mop->mpp[mop->ix];

    matrix_print(xmxp);
}



/**-------------------------- NLA begin ---------------------------**/

/*
 *  Given an n-element vector xp, compute its Householder vector and beta.
 *
 *    n:    number of elements in xp; must be >= 2.
 *    xp:   input vector, only read.
 *    beta: receives the computed beta; not written when n < 2.
 *
 *  Returns the Householder vector, obtained from vector_dup2(), or NULL
 *  (after a debug message) when n < 2.  sigma and the initial vector are
 *  always reported through the debug channel.
 */
double *vector_housevec(unsigned int n, double *xp, double *beta)
{
	unsigned int k;
	double sigma, mu, b, head, head2, *hvp;

	if ( n < 2 ) {
		sprintf(dbg, "Error n = %d, must be >= 2", n); DBG(dbg); 
		return NULL;
	}

	/* sigma: inner product of the tail x[1..n-1] with itself */
	sigma = vectors_inner_prod(n - 1, xp + 1, xp + 1);
	hvp = vector_dup2(n, xp);
	hvp[0] = 1.0;
	head = xp[0];

	sprintf(dbg, " sigma= %g", sigma); DBG(dbg); 
	vector_sprint("v==>", n, hvp);

	if ( sigma == 0.0 && head >= 0.0 ) {
		b =  0.0;
	} else if ( sigma == 0.0 && head < 0.0 ) {
		b = -2.0;
	} else {
		mu = sqrt(head * head + sigma);
		hvp[0] = ( head <= 0.0 ) ? head - mu : -sigma/(head + mu);

		/* normalise so that the first element becomes 1 */
		head  = hvp[0];
		head2 = head * head;
		b = 2 * head2 /(sigma + head2);
		for ( k = 0; k < n; k++ )
			hvp[k] /= head;
	}
	*beta = b;

	return hvp;
}

/**
 *  Same computation as vector_housevec(), except that the result buffer
 *  hvp is supplied by the caller.
 *
 *    n:    number of elements in xp and hvp; must be >= 2.
 *    xp:   input vector, only read.
 *    hvp:  pre-allocated buffer of n doubles; receives the Householder
 *          vector.
 *    beta: receives the computed beta.
 *
 *  When n < 2 a debug message is emitted and neither hvp nor beta is
 *  written.
 **/

void  vector_housevec2(unsigned int n, double *xp, double *hvp, double *beta)
{
	unsigned int k;
	double sigma, mu, head, head2, b;

	if ( n < 2 ) {
		sprintf(dbg, "Error n = %d, must be >= 2", n); DBG(dbg); 
		return;
	}

	/* sigma: inner product of the tail x[1..n-1] with itself */
	sigma = vectors_inner_prod(n - 1, xp + 1, xp + 1);
	memcpy(hvp, xp, n * sizeof(double));
	hvp[0] = 1.0;
	head = xp[0];

	if ( sigma == 0.0 && head >= 0.0 ) {
		b =  0.0;
	} else if ( sigma == 0.0 && head < 0.0 ) {
		b = -2.0;
	} else {
		mu = sqrt(head * head + sigma);
		hvp[0] = ( head < 0.0 ) ? head - mu : -sigma/(head + mu);

		/* normalise so that the first element becomes 1 */
		head  = hvp[0];
		head2 = head * head;
		b     = 2 * head2 /(sigma + head2);
		for ( k = 0; k < n; k++ )
			hvp[k] /= head;
	}
	*beta = b;
}

/**-- Compute the P = I - beta * v * v', the Householder matrix --**/
// 
// In practice, this P matrix may never be formed explicitly 
// when computing either P * A or A * P, rather use 
// 		
// 		A - beta * v * (v' * A) 
// or
// 		A - (A * v) * (beta * v'). 
//  
//  where v' is the transpose of v.
//
//
struct matrix_dsc_s *matrix_housemat(struct matrix_op_s *mop, unsigned int n, double *vp, double beta)
{
	unsigned int i, j;
	struct matrix_dsc_s *mxp;
	double **rpp, *rp, t;
	char id[BUF_LEN];

	sprintf(id, "Householder P, beta = %g", beta);
	mxp = matrix_create(id, n, n);
	rpp = MXRPP(mxp);

	for ( i = 0; i < n; i++ ) {
		rp = rpp[i];
		for ( j = 0; j < n; j++ ) {
			t = beta * vp[i] * vp[j];
			if ( i != j ) {
				rp[j] = -t;
			} else {
				rp[j] = 1.0 - t; 
			}
		}
	}
	
	if ( mop )
		matrix_add_mxp(mop, mxp);

	return mxp;
}

/*
 *  Map the submxp matrix "shadow" structure onto the hmxp matrix.
 *
 *  The size of the shadow is the current size of submxp.  After the call
 *  row i of submxp points at column noff of row (i + moff) of hmxp, so
 *  submxp addresses a window of hmxp instead of storage of its own.
 *
 *  The window has to fit inside hmxp; otherwise debug messages are emitted
 *  and nothing is changed.
 */
void matrix_submat_map(struct matrix_dsc_s *hmxp, struct matrix_dsc_s *submxp, unsigned int moff, unsigned int noff)
{
	unsigned int r;
	const unsigned int sub_rows  = MXROWS(submxp), host_rows = MXROWS(hmxp);
	const unsigned int sub_cols  = MXCOLS(submxp), host_cols = MXCOLS(hmxp);
	double **host = MXRPP(hmxp);
	double **sub  = MXRPP(submxp);

	if ( ((sub_rows + moff) > host_rows) || ( (sub_cols + noff) > host_cols) ) {
		sprintf(dbg, "Error: (m, n) offsets =(%d, %d), submxp size(%d, %d) while hmxp size(%d, %d)", 
			moff, noff, sub_rows, sub_cols, host_rows, host_cols); DBG(dbg); 
		DBG("Nothing was done! Please check the cause of the error.");
		return;
	}

	for ( r = 0; r < sub_rows; r++ )
		sub[r] = host[r + moff] + noff;
}

/*
 *  Map the submxp matrix "shadow" structure onto the hmxp matrix.
 *
 *  Unlike matrix_submat_map(), the size of the shadow is derived from the
 *  offsets: submxp is resized with matrix_resize() to cover everything from
 *  (moff, noff) to the last row and column of hmxp.  After the call row i
 *  of submxp points at column noff of row (i + moff) of hmxp.
 *
 *  Both offsets have to lie inside hmxp; otherwise debug messages are
 *  emitted (reporting the current size of submxp) and nothing is changed.
 */
void matrix_submat_map2(struct matrix_dsc_s *hmxp, struct matrix_dsc_s *submxp, unsigned int moff, unsigned int noff)
{
	unsigned int i, m, n, hm, hn;
	double **hrpp, *hrp, **srpp;
	
	hm = MXROWS(hmxp);
	hn = MXCOLS(hmxp);

	/* Current size of the shadow; needed for the error report below. */
	m = MXROWS(submxp);
	n = MXCOLS(submxp);
	
	if ( ( moff < hm) && ( noff < hn ) ) {
		
		m = hm - moff;  
		n = hn - noff;
		matrix_resize(submxp, m, n);

		hrpp = MXRPP(hmxp);
		srpp = MXRPP(submxp);
		for ( i = 0; i < m; i++ ) {
			hrp = hrpp[i + moff];
			srpp[i] = hrp + noff;
		}
	} else {
		sprintf(dbg, "Error: (m, n) offsets =(%d, %d), submxp size(%d, %d) while hmxp size(%d, %d)", 
			moff, noff, m, n, hm, hn); DBG(dbg); 
		DBG("Nothing was done! Please check the cause of the error.");
	}

}


/*
 *    Householder premultiplication
 *    
 *    P * A  =  (I - beta * v * v') * A = A - (beta * v) * (v' * A)
 * 
 *    v is the computed Householder vector.
 */
void matrix_hh_premul(unsigned int n1, double *vp, double beta, struct matrix_dsc_s *amxp, struct matrix_dsc_s *pamxp)
{

	unsigned int i, j, k, m, n;
	double **arpp, *arp, **parpp, *parp, *up, bv, vTA, t, z;

	m = MXROWS(amxp);
	n = MXCOLS(amxp);
	up = MALLOC(n, double);
	
	if ( !up ) {
		DBG("error to allocate storage for up.");
		return;
	}
	
	 arpp = MXRPP(amxp);
	parpp = MXRPP(pamxp);
	
	// 1. Compute v.T * A first, result is a row vector.
	for ( j = 0; j < n; j++ ) {  // Go thru the rows ...
		vTA = 0.0;
		for ( i = 0; i < m; i++ ) { // the inner product btw
			arp = arpp[i];          // vp * A[:, i]
			vTA += vp[i] * arp[j];
		}
		up[j] = vTA;
	}

	// 2. Now we have u = v.T * A, which is considered a column vector.

	for ( i = 0; i < m; i++ ) {
		 arp = arpp[i];
		parp = parpp[i];
		for ( j = 0; j < n; j++ ) {
			parp[j] = arp[j] - beta * vp[i] * up[j];
		}
	}

	free(up);
}

/*
 *    Householder postmultiplication
 *
 *    A * P  = A * (I - beta * v * v') = A - (beta * A * v) *  v'
 *
 *    n1:    not used by this routine.
 *    vp:    the computed Householder vector v (one element per column of A).
 *    beta:  the Householder scalar.
 *    amxp:  matrix A, only read.
 *    apmxp: receives A * P, using the row and column counts of A.
 *
 *    A scratch vector with one element per row of A is allocated and
 *    released again; if the allocation fails a debug message is emitted
 *    and apmxp is left untouched.
 */
void matrix_hh_postmul(unsigned int n1, double *vp, double beta, struct matrix_dsc_s *amxp, struct matrix_dsc_s *apmxp)
{
	unsigned int r, c;
	const unsigned int nrows = MXROWS(amxp);
	const unsigned int ncols = MXCOLS(amxp);
	double **a_rows, **ap_rows, *a_row, *ap_row, *bav, av;

	bav = MALLOC(nrows, double);
	if ( !bav ) {
		DBG("error to allocate storage for up.");
		return;
	}

	/* debug output, switched off */
	if ( 0 ) {
		matrix_print(amxp);
		sprintf(dbg, "beta = %g", beta); DBG(dbg); 
		vector_sprint("hh vec => ", ncols, vp);
	}
	a_rows  = MXRPP(amxp);
	ap_rows = MXRPP(apmxp);

	// 1. bav = beta * A * v: one row-times-vector inner product per row.
	for ( r = 0; r < nrows; r++ ) {
		av = vectors_inner_prod(ncols, a_rows[r], vp);
		bav[r] = beta * av;
	}

	/* debug output, switched off */
	if ( 0 )
		vector_sprint("A * v ==>", ncols, bav); 

	// 2. A * P = A - bav * v', one row at a time.
	for ( r = 0; r < nrows; r++ ) {
		a_row  = a_rows[r];
		ap_row = ap_rows[r];
		for ( c = 0; c < ncols; c++ )
			ap_row[c] = a_row[c] - bav[r] * vp[c];
	}

	free(bav);
}

void matrix_hh_postmul2(unsigned int n, double *vp, double beta, struct matrix_dsc_s *amxp, struct matrix_dsc_s *apmxp)
{
	/*
	 * Householder postmultiplication through an explicit reflector:
	 * build the n-by-n matrix P = I - beta * v * v' in a temporary and
	 * compute apmxp from amxp and P with matrix_mul_abc.
	 */
	unsigned int row, col;
	double **prows, *prow, nbv, entry;
	struct matrix_dsc_s *pmxp;

	pmxp  = matrix_create("temp", n, n);
	prows = MXRPP(pmxp);

	for ( row = 0; row < n; row++ ) {
		prow = prows[row];
		nbv  = -beta * vp[row];
		for ( col = 0; col < n; col++ ) {
			entry = nbv * vp[col];
			if ( row == col )
				entry += 1.0;	// the identity part on the diagonal
			prow[col] = entry;
		}
	}

	matrix_mul_abc(amxp, pmxp, apmxp);

	matrix_dsc_fini(pmxp);
}



/*
 *    Compute the C and S of plane( Givens) rotation as in
 *                        T 
 *    given a vector [a b] 
 *    +            + T
 *    |   C    S   |
 *    |            |
 *    |  -S    C   |
 *    +            +
 *
 */
void matrix_givens_rotate(double a, double b, double *cp, double *sp)
{
	double r, c, s;
	
	if ( a == 0.0 ) {
		s = 1.0, c = 0;
	} else if ( b == 0.0 ) {
		c = 1.0, s = 0;
	} else {
		if (fabs(b) > fabs(a) ) {
			r = -a/b, s = 1.0/sqrt(1 + r * r ), c = s * r;
		} else {
			r = -b/a, c = 1.0/sqrt(1 + r * r ), s = c * r;
		}
	}

	*cp = c,  *sp = s;
}

void matrix_givens_coord(double a, double b, double *newa, double *newb, double *cp, double *sp)
{
	/*
	 * Rotate the point (a, b) with the Givens pair computed by
	 * matrix_givens_rotate():
	 *      *newa = c * a - s * b
	 *      *newb = s * a + c * b
	 * The pair (c, s) is handed back only when BOTH cp and sp are non-NULL.
	 */
	double cosv, sinv;

	matrix_givens_rotate(a, b, &cosv, &sinv);

	*newa = cosv * a - sinv * b;
	*newb = sinv * a + cosv * b;

	if ( !cp || !sp )
		return;

	*cp = cosv;
	*sp = sinv;
}

/*
 *   Fill the 2x2 matrix gmxp with the Givens rotation computed from the
 *   pair (a, b) by matrix_givens_rotate().
 *
 *   flag_xpose == 0 :  | c   s |      flag_xpose != 0 :  | c  -s |
 *                      | -s  c |                         | s   c |
 */
void matrix_givens_compute(struct matrix_dsc_s *gmxp, double a, double b, unsigned int flag_xpose)
{
	double cosv, sinv, offdiag;
	double **rows;

	matrix_givens_rotate(a, b, &cosv, &sinv);
	rows = MXRPP(gmxp);

	// The diagonal is the same for both orientations.
	rows[0][0] = cosv;
	rows[1][1] = cosv;

	// Only the sign of the off-diagonal pair depends on the flag.
	offdiag = flag_xpose ? -sinv : sinv;
	rows[0][1] =  offdiag;
	rows[1][0] = -offdiag;
}

/*
 *   For operations that transfer data between a matrix and a vector,
 *   check that the requested parameters are within bounds.
 *
 *   rc_type  : MATVEC_XTYPE_ROW for a row vector, anything else is
 *              treated as a column vector.
 *   rc_index : index of the row (or column) being accessed; must be
 *              less than the number of rows (or columns).
 *   ne       : number of elements to transfer.
 *   k        : offset of the first element inside the row (or column).
 *
 *   The elements [k, k+ne) run ALONG the selected row/column, so they
 *   are checked against the other dimension: the number of columns for a
 *   row vector, the number of rows for a column vector.
 *
 *   Returns 1 when every check passes, 0 (after logging) otherwise.
 */
unsigned int matvec_xfer_sanity(unsigned int rc_type, struct matrix_dsc_s *mxp, unsigned int rc_index, unsigned int ne, unsigned int k)
{
	unsigned int y, is_row, idx_max, len_max;
	const char *rcp;

	y = 1;
	is_row  = ( rc_type == MATVEC_XTYPE_ROW );
	idx_max = is_row ? MXROWS(mxp) : MXCOLS(mxp);   // bound for rc_index
	len_max = is_row ? MXCOLS(mxp) : MXROWS(mxp);   // bound for k and ne
	rcp     = is_row ? "row" : "column";

	// sanity check; every argument list matches its format string
	if ( rc_index >= idx_max ) { 
		sprintf(dbg, "%s index %u, out of %s range [0:%d]", 
			rcp, rc_index, rcp, (int)idx_max - 1); DBG(dbg);
		y = 0;
	} else if ( ne > len_max ) {
		sprintf(dbg, "no. elements to retrieve ne= %u, out of %s range [0:%d]", 
			ne, rcp, (int)len_max - 1); DBG(dbg);
		y = 0;
	} else if ( k > len_max ) {
		sprintf(dbg, "offset to beginning k = %u, out of %s  range [0:%d]", 
			k, rcp, (int)len_max - 1); DBG(dbg);
		y = 0;
	} else if ( ne > len_max - k ) {   // same as k+ne > len_max, without wrap-around
		sprintf(dbg, "k+ne= %u, out of %s range [0:%d]", 
			k+ne, rcp, (int)len_max - 1); DBG(dbg);
		y = 0;
	}

	return y;
}

/*
 *  Retrieve part or whole elements from a specific row in matrix mxp.
 *  Arguments:
 *  input:  ri -- the index to the row whose elements are being retrived
 *          k  -- the offset to the first element in the row
 *          
 *          ne -- the number of elements in the row vector being retrived
 *  output: vp -- the storage where the elements retrieved are stored.
 */
unsigned int matrix_load_rowvec(struct matrix_dsc_s *mxp, unsigned int ri, unsigned int ne, double *vp, unsigned int k)
{
	unsigned int i, j, m, m1, n, y;
	double **rpp, *rp;

	m = MXROWS(mxp);    rpp = MXRPP(mxp);
	n = MXCOLS(mxp);    y = 1;

	y = matvec_xfer_sanity(MATVEC_XTYPE_ROW, mxp, ri, ne, k);

	if ( !y ) {
		DBG("Error, nothing was done");
	} else {
		j = 0;
		m1 = k + ne;
		rp = rpp[ri];
		for ( i = k; i < m1; i++ ) {
			vp[j++] = rp[i];
		}
	}

	return y;
}

unsigned int matrix_save_rowvec(struct matrix_dsc_s *mxp, unsigned int ri, unsigned int ne, double *vp, unsigned int k)
{
	unsigned int i, j, m, n, m1, y;
	double **rpp, *rp;

	m = MXROWS(mxp);    rpp = MXRPP(mxp);
	n = MXCOLS(mxp);    y = 1;

	y = matvec_xfer_sanity(MATVEC_XTYPE_ROW, mxp, ri, ne, k);

	if ( !y ) {
		DBG("Error, nothing was done");
	} else {
		j = 0;
		m1 = k + ne;
		rp = rpp[ri];
		for ( i = k; i < m1; i++ ) {
			rp[i]= vp[j++];
		}
	
	}

	return y;
}


/*
 *  Copy part or all of one column of matrix mxp into a caller buffer.
 *  Arguments:
 *  input:  ci -- index of the column to read from
 *          k  -- offset (row index) of the first element in that column
 *          ne -- number of elements to copy
 *  output: vp -- receives the ne elements, starting at vp[0]
 *
 *  Returns the verdict of matvec_xfer_sanity(); when it is 0 an error
 *  is logged and vp is not written.
 */
unsigned int matrix_load_colvec(struct matrix_dsc_s *mxp, unsigned int ci, unsigned int ne, double *vp, unsigned int k)
{
	unsigned int ok, idx;
	double **rows;

	ok = matvec_xfer_sanity(MATVEC_XTYPE_COL, mxp, ci, ne, k);
	if ( !ok ) {
		DBG("Error, nothing was done");
		return ok;
	}

	rows = MXRPP(mxp);
	for ( idx = 0; idx < ne; idx++ )
		vp[idx] = rows[k + idx][ci];

	return ok;
}

unsigned int matrix_save_colvec(struct matrix_dsc_s *mxp, unsigned int ci, unsigned int ne, double *vp, unsigned int k)
{
	/*
	 * Counterpart of matrix_load_colvec(): store vp[0 .. ne-1] into column
	 * ci of mxp, starting at row offset k.  Returns the verdict of
	 * matvec_xfer_sanity(); when it is 0 an error is logged and the matrix
	 * is not modified.
	 */
	unsigned int ok, idx;
	double **rows;

	ok = matvec_xfer_sanity(MATVEC_XTYPE_COL, mxp, ci, ne, k);
	if ( !ok ) {
		DBG("Error, nothing was done");
		return ok;
	}

	rows = MXRPP(mxp);
	for ( idx = 0; idx < ne; idx++ )
		rows[k + idx][ci] = vp[idx];

	return ok;
}


/*
 *   Transform a symmetric square (n x n) matrix mxp into
 *   a tridiagonal matrix via Householder transformation. 
 *
 *   The matrix is modified in place.  For each column i the part below
 *   the diagonal is loaded, its 2-norm is written to the two symmetric
 *   off-diagonal positions (i, i+1) and (i+1, i), and the Householder
 *   vector (without its first element) is saved back into column i
 *   below that position.  The trailing sub-matrix is then updated with
 *   a rank-2 update (matrix_hh_rank2).
 *
 *   NOTE(review): the loop bound l-2 is unsigned; for matrices smaller
 *   than 2 it wraps around -- confirm callers never pass such sizes.
 *   NOTE(review): the results of MALLOC are not checked.
 */

void  matrix_tridiag_house(struct matrix_dsc_s *mxp)
{
	char id[BUF_LEN];
	unsigned int i, j, m, n, ne, ne1, ak, l, am, an;
	struct matrix_dsc_s *amxp, *rmxp, *Qmxp;
	double **rpp, *rp, *vp, *hp, beta, a, s;
	
	m = MXROWS(mxp);
	n = MXCOLS(mxp);

	// Scratch vectors: vp holds the loaded column (later reused),
	// hp holds the Householder vector.
	l = max(m, n);
	vp = MALLOC(l, double);
	hp = MALLOC(l, double);

	// Create the "shadow" matrix (row pointers only) that is mapped onto
	// sub-blocks of mxp.
	sprintf(id, "shadow of %s", MXID(mxp));
	amxp = matrix_create_rowp_only(id, m, n);

	// rmxp is created here and released at the end; nothing in between
	// uses it (the resize in the loop is commented out).
	sprintf(id, "resized %s", MXID(mxp));
	rmxp = matrix_create(id, m, n);
	
	ne = m-1, ak = 1,  // column vector to be HH transformed with 
	am = m,   an = n;  // offset 1, not 0.
	for ( i = 0; i < l-2; i++ ) {
		
		// 1. Map the sub-matrix.    //+===+----- these are row and col. 
		an = i+1;
		matrix_submat_map2(mxp, amxp, an, an); // offsets, respectively.
		// 2. Resize the  matrixes.
		//matrix_resize( rmxp, MXROWS(amxp), MXCOLS(amxp) );
		
		// 3. Load the column or row vector to be HH transformed:
		//    ne elements of column i, starting at row ak.
		matrix_load_colvec(mxp, i, ne, vp, ak);
		
		// 4. the norm of the loaded vector save it in the symmetric positions 
		a = sqrt(vectors_inner_prod(ne, vp, vp));
		matrix_set_value(mxp, i, an, a);
		matrix_set_value(mxp, an, i, a);

		// 5. Compute the Householder vector, which is stored in hp.
		vector_housevec2(ne, vp, hp, &beta);
		
		// Save the vector without its first element below the
		// sub-diagonal entry of column i.
		matrix_save_colvec(mxp, i, ne-1, hp+1, ak+1);
		
		// 6. Householder matrix
		matrix_vector_mul(amxp, hp, vp, beta);  // Note vp is overwritten here.
		s = 0.5 * beta * vectors_inner_prod(ne, hp, vp);
		// presumably vp += (-s) * hp -- confirm against vector_add_scaled
		vector_add_scaled(ne, vp, hp, -s);
		
		// 7. Now the rank-2 update of the A matrix
		matrix_hh_rank2(amxp, vp, hp);		

		// 'an' is reassigned at the top of the next iteration; 'am' is
		// not read inside the loop.
		ne--, ak++, am--, an--;
	}


	// Q is accumulated from the stored Householder vectors and then
	// released immediately; it is not returned to the caller.
	matrix_resize(amxp, m-1, n-1);
	matrix_submat_map(mxp, amxp, 1, 0);
	Qmxp = matrix_backaccumQR(amxp, MATVEC_XTYPE_COL);
	matrix_dsc_fini(Qmxp);


	free(vp), free(hp);
	matrix_dsc_fini(amxp);
	matrix_dsc_fini(rmxp);
}

/*
 *  Scaled matrix-vector product:
 *
 *     vp = scale * (mxp * up)
 *
 *  up is read with one element per column of mxp, vp is written with
 *  one element per row of mxp.
 */

void matrix_vector_mul(struct matrix_dsc_s *mxp, double *up, double *vp, double scale)
{
	unsigned int row, nrows, ncols;
	double **rows;

	rows  = MXRPP(mxp);
	nrows = MXROWS(mxp);
	ncols = MXCOLS(mxp);

	// One inner product per row, scaled as it is stored.
	for ( row = 0; row < nrows; row++ )
		vp[row] = vectors_inner_prod(ncols, rows[row], up) * scale;
}

/*
 *  Householder matrix rank-2 update, performed in place:
 *
 *      mxp = mxp - v * u' - u * v'
 *
 *  The matrix is expected to be square; this is not checked.
 */
void matrix_hh_rank2(struct matrix_dsc_s *mxp, double *vp, double *up)
{
	unsigned int r, c, nrows, ncols;
	double **rows, *cur;

	rows  = MXRPP(mxp);
	nrows = MXROWS(mxp);
	ncols = MXCOLS(mxp);

	for ( r = 0; r < nrows; r++ ) {
		cur = rows[r];
		for ( c = 0; c < ncols; c++ )
			cur[c] = cur[c] - vp[r] * up[c] - up[r] * vp[c];
	}
}

/*
 *   Transform a matrix mxp into (upper) bidiagonal form
 *   via Householder transformation.
 *
 *   The matrix is modified in place.  Each step i first applies a
 *   Householder premultiplication built from column i, then (while
 *   i+2 < n) a postmultiplication built from row i.  The Householder
 *   vectors, without their first element, are saved back into the
 *   column below the diagonal and the row right of the super-diagonal.
 *
 *   NOTE(review): the row vector length ne1 is derived from the row
 *   count m, so this looks written for square input -- confirm.
 *   NOTE(review): the results of MALLOC are not checked, and l-1 is
 *   unsigned, so an empty matrix would wrap the loop bound.
 */

void  matrix_bidiag_house(struct matrix_dsc_s *mxp)
{
	char id[BUF_LEN];
	unsigned int i, j, m, n, ne, ne1, ak, l, am, an;
	struct matrix_dsc_s *amxp, *rmxp;
	double **rpp, *rp, *vp, *hp, beta;
	
	m = MXROWS(mxp);
	n = MXCOLS(mxp);

	// Scratch vectors: vp = loaded column/row, hp = Householder vector.
	l = max(m, n);
	vp = MALLOC(l, double);
	hp = MALLOC(l, double);

	// Create the "shadow" matrix (row pointers only) mapped onto sub-blocks
	// of mxp, and a work matrix rmxp that receives each product.
	sprintf(id, "shadow of %s", MXID(mxp));
	amxp = matrix_create_rowp_only(id, m, n);
	
	sprintf(id, "resized %s", MXID(mxp));
	rmxp = matrix_create(id, m, n);
	
	// ne = current vector length, ak = current offset; am/an are only
	// decremented and never read in this routine.
	ne = m, ak = 0, am = m, an = n;
	for ( i = 0; i < l-1; i++ ) {
		//matrix_print(mxp);
		
		
		// 1. Map the sub-matrix.  //+==+----- these are row and col. 
		matrix_submat_map2(mxp, amxp, i, i); // offsets, respectively.
		// 2. Resize the  matrixes.
		matrix_resize( rmxp, MXROWS(amxp), MXCOLS(amxp) );
		// NOTE(review): MXSIZE2 is defined outside this view -- purpose
		// not visible here.
		MXSIZE2(1, amxp);

		// 3. Load the column or row vector to be HH transformed.
		matrix_load_colvec(mxp, i, ne, vp, ak);
		
		// 4. Compute the Householder vector.
		vector_housevec2(ne, vp, hp, &beta);
		
		// 5. Householder premultiplication
		matrix_hh_premul(ne, hp, beta, amxp, rmxp);

		// 6. Copy the result in rmxp back to R via amxp.
		matrix_copy_ab(amxp, rmxp);

		// 7. Save the column HH vector (minus its first element) below
		//    the diagonal of column i.
		matrix_save_colvec(mxp, i, ne-1, hp+1, ak+1);
		
		// Row step: same sequence on row i, right of the diagonal.
		if ( i+2 < n ) {
			ne1 = ne - 1;
			
			matrix_submat_map2(mxp, amxp, i, i+1); 
			matrix_resize(rmxp, MXROWS(amxp), MXCOLS(amxp));
				
			matrix_load_rowvec(mxp, i, ne1, vp, ak+1);
			
			vector_housevec2(ne1, vp, hp, &beta);
			
			// ----- postmultiplication -----
			matrix_hh_postmul2(ne1, hp, beta, amxp, rmxp);
			matrix_copy_ab(amxp, rmxp);
			// Save the row HH vector (minus its first element).
			matrix_save_rowvec(mxp, i, ne1-1, hp+1, ak+2);
			
		}
		
		ne--, ak++, am--, an--;
	}


	free(vp), free(hp);
	matrix_dsc_fini(amxp);
	matrix_dsc_fini(rmxp);
}

/*
 *   Transform a matrix into upper triangular via Householder
 *   transformation.
 *
 *   mxp  : matrix to transform, modified in place.
 *   k    : subtracted from the initial vector length (see ne below).
 *   type : MATVEC_XTYPE_COL to build the Householder vectors from
 *          columns; any other value builds them from rows.
 *
 *   NOTE(review): this routine prints unconditionally on every step
 *   (beta, the Householder vector and the whole matrix), so it looks
 *   like a debugging/experimental variant -- confirm before relying on it.
 *   NOTE(review): the row branch still applies a PRE-multiplication, and
 *   its initial values carry the original author's "<---???" marker.
 *   NOTE(review): the results of MALLOC are not checked.
 */
void  matrix_tri_house(struct matrix_dsc_s *mxp, unsigned int k, unsigned int type)
{
	char id[BUF_LEN];
	unsigned int i, j, m, n, ne, ak, l, am, an;
	struct matrix_dsc_s *amxp, *rmxp;
	double **rpp, *rp, *vp, *hp, beta;
	
	m = MXROWS(mxp);
	n = MXCOLS(mxp);

	// ne = length of the first vector, l = number of steps,
	// ak = offset of the first element, am x an = first sub-matrix size.
	if ( type == MATVEC_XTYPE_COL ) {
		vp = MALLOC(m, double);
		hp = MALLOC(m, double);
		ne = m-k;
		l = n -1;
		ak = 0;
		am = m, an = n;
	} else {
		vp = MALLOC(n, double);
		hp = MALLOC(n, double);
		ne = n-k-1;
		l = m -1-1;
		ak = 2;
		am = m, an = n;  // <---???
	}
	
	// Create the "shadow" matrix (row pointers only) and the work matrix
	// that receives each product.
	sprintf(id, "shadow of %s", MXID(mxp));
	amxp = matrix_create_rowp_only(id, m, n);
	
	sprintf(id, "resized %s", MXID(mxp));
	rmxp = matrix_create(id, m, n);
	
	for (i = 0; i < l; i++) {
		// 1. Resize the "shadow" matrixes.
		matrix_resize(amxp, am, an);
		matrix_resize(rmxp, am, an);
		
		// 2. Map the sub-matrix.  //+==+----- these are row and col. 
		matrix_submat_map(mxp, amxp, i, i); // offsets, respectively.

		// 3. Load the column or row vector to be HH transformed.
		if ( type == MATVEC_XTYPE_COL ) {
			matrix_load_colvec(mxp, i, ne, vp, ak);
		} else {
			matrix_load_rowvec(mxp, i, ne, vp, ak);
		}
		
		// 4. Compute the Householder vector (and log it).
		vector_housevec2(ne, vp, hp, &beta);
		sprintf(dbg, "beta = %g", beta); DBG(dbg); 
		vector_sprint("house vec => ", ne, hp);
		// 5. Householder premultiplication
		matrix_hh_premul(ne, hp, beta, amxp, rmxp);

		// 6. Copy the result in rmxp back to R via amxp.
		matrix_copy_ab(amxp, rmxp);
		
		matrix_print(mxp);
		// Shrink the working window by one row and one column.
		ne--, ak++, am--, an--;
	}

	free(vp), free(hp);
	matrix_dsc_fini(amxp);
	matrix_dsc_fini(rmxp);
}

/*
 *  QR factorization via Householder transformation
 *     
 *     A = Q * R, where Q is an orthogonal matrix and 
 *                      R is an upper triangular matrix.
 *    
 *  Sizes:
 *     A :  m x n
 *     Q :  m x m
 *     R :  m x n
 *
 *  A is gradually transformed into an upper triangular matrix through a
 *  series of steps.  R first receives a copy of A; all "live"
 *  modification is then performed on R, so Amxp itself is not modified
 *  by this routine.
 *
 *  Amxp : in.   Qmxp : out.   Rmxp : out.
 *
 *  If the size checks on the three matrices fail, an error is logged
 *  and nothing is computed.
 *  NOTE(review): the results of MALLOC are not checked.
 */

void  matrix_houseQR(struct matrix_dsc_s *Amxp, struct matrix_dsc_s *Qmxp, struct matrix_dsc_s *Rmxp)
{
	char id[BUF_LEN];
	unsigned int i, j, m, n, nvec, ci, ne, k, mc, am, an, qm, qn;
	struct matrix_dsc_s *amxp, *qmxp, *rmxp; // "shadow" matrixes for A and R
	double **Arpp, *Arp, **Qrpp, *Qrp, **Rrpp, *Rrp, *hvp, *xp, beta;

	// MXSIZE_EQ / MXINDIM_EQ are defined outside this view; presumably
	// they compare the full size and the inner dimension -- confirm.
	mc = MXSIZE_EQ(Amxp, Rmxp) && MXINDIM_EQ(Qmxp, Rmxp);
	if ( !mc ) {
		DBG("error: please check.");
		return;
	}
	
	// R starts out as a copy of A (destination is the first argument).
	matrix_copy_ab(Rmxp, Amxp);

	// hvp = Householder vector, xp = the column it is computed from.
	m = MXROWS(Amxp);    hvp = MALLOC(m, double); 
	n = MXCOLS(Amxp);     xp = MALLOC(m, double); 
	
	// Create the "shadow" matrix (row pointers only) and the work matrix
	// that receives each product.
	sprintf(id, "shadow of %s", MXID(Amxp));
	amxp = matrix_create_rowp_only(id, m, n);
	
	sprintf(id, "resized %s", MXID(Amxp));
	rmxp = matrix_create(id, m, n);
	
	// ------- Compute R thru Householder transformation ---------

	// ne = current column length, k = current row offset,
	// am x an = size of the current trailing sub-matrix.
	ne = m, k = 0, am = m, an = n;
	for ( j = 0; j < n-1; j++ ) {  // loop thru all the columns of A
		ci = j;
		
		// 1. Resize the "shadow" matrixes.
		matrix_resize(amxp, am, an);
		matrix_resize(rmxp, am, an);

		// 2. Map the sub-matrix of R (which is a copy of A) to "a".
		matrix_submat_map(Rmxp, amxp, j, j);

		// 3. Load the column indexed by ci from R, from row k downwards.
		matrix_load_colvec(Rmxp, ci, ne, xp, k);
		
		// 4. Compute the Householder vector.
		vector_housevec2(ne, xp, hvp, &beta);
		
		// 5. Householder premultiplication
		matrix_hh_premul(ne, hvp, beta, amxp, rmxp);

		// 6. Copy the result in rmxp back to R via amxp.
		matrix_copy_ab(amxp, rmxp);  // from B(the 2nd arg.) to A(the 1st arg)
		
		// 7. Save the Householder vector below the diagonal of column
		//    ci, excluding the first element, which is always
		//    normalized to 1.  (The test is always true.)
		//
		if ( 1 ) {
			matrix_save_colvec(Rmxp, ci, ne-1, hvp+1, k+1);
		}

		ne--, k++, am--, an--;
	}
	
	// --- Compute the Q matrix from the Householder vectors 
	//     (the essential parts stored in R) and copy it to the caller.
	qmxp = matrix_backaccumQR(Rmxp, MATVEC_XTYPE_COL);
	matrix_copy_ab(Qmxp, qmxp);
	
	// Clean up R; presumably this clears the stored vectors below the
	// diagonal using 1e-15 as a threshold -- confirm against
	// matrix_set_uptriangular.
	
	matrix_set_uptriangular(Rmxp,  1e-15 );
	free(hvp);  matrix_dsc_fini(amxp);
	free(xp);   matrix_dsc_fini(rmxp);   matrix_dsc_fini(qmxp);
}

/*
 *   Given a Hessenberg matrix, do one QR step with Givens rotations.
 *   Hmxp: in
 *   Qmxp: out, orthogonal; may be NULL, in which case Q is not
 *         accumulated
 *   Rmxp: out, still a Hessenberg matrix
 *
 *   Q * R * Q' = H
 *       +---- still a Hessenberg matrix
 *
 *   The first loop applies one rotation per sub-diagonal element from
 *   the left; the second loop applies the same rotations from the right.
 *
 *   NOTE(review): cp has m-1 and sp has n-1 elements while both loops
 *   index both arrays, so this assumes a square matrix -- confirm.
 *   NOTE(review): a debug line is logged on every step of the first loop.
 */
void  matrix_hessQR(struct matrix_dsc_s *Hmxp, struct matrix_dsc_s *Qmxp, struct matrix_dsc_s *Rmxp)
{
	unsigned int i, j, m, n, k;
	double *cp, *sp, a, b, na, nb, c, s;
	struct matrix_dsc_s *hmxp, *gmxp, *mxp;

	// R starts as a copy of H; Q (if requested) starts as the identity.
	matrix_copy_ab(Rmxp, Hmxp);
	if ( Qmxp )
		matrix_set_id(Qmxp);
	
	// cp / sp remember the rotation of every step for the second loop.
	m = MXROWS(Rmxp);   cp = MALLOC(m-1, double);
	n = MXCOLS(Rmxp);   sp = MALLOC(n-1, double);
	
	// hmxp: shadow mapped onto R or Q; gmxp: the 2x2 rotation;
	// mxp: holds each product before it is copied back.
	hmxp = matrix_create_rowp_only("shadow", 2, 2);
	gmxp = matrix_create("Givens", 2, 2);
	mxp  = matrix_create("temp", 2, 2);

	for ( i = 0; i < m-1; i++ ) {
		k = i;
		sprintf(dbg, "(%d:%d, %d:%d)", k, k+1, k, m-1); DBG(dbg); 
		// Rotation computed from the diagonal element and the one below it.
		a = matrix_get_value(Rmxp, k,   k);
		b = matrix_get_value(Rmxp, k+1, k);
		matrix_givens_coord(a, b, &na, &nb, &c, &s);
		cp[i] = c,  sp[i] = s;
		matrix_set_givens(gmxp, c, -s);  // This is actually the transposed Givens.
		// Apply it from the left to rows k, k+1 (columns k .. n-1) of R.
		matrix_resize(hmxp, 2, n-k);
		matrix_resize(mxp,  2, n-k);
		matrix_submat_map(Rmxp, hmxp, k, k);
		matrix_mul_abc(gmxp, hmxp, mxp);
		matrix_copy_ab(hmxp, mxp);
		
		if ( Qmxp ) {  // Accumulate Q if required
			// Columns k, k+1 of Q are multiplied by the rotation
			// from the right.
			matrix_resize(hmxp, m, 2);
			matrix_resize(mxp,  m, 2);
			matrix_submat_map(Qmxp, hmxp, 0, k);
			matrix_set_givens(gmxp, c, s);
			matrix_mul_abc(hmxp, gmxp, mxp);
			matrix_copy_ab(hmxp, mxp);
		}
	
	} 


	// Apply the saved rotations from the right: step k touches rows
	// 0 .. k+1 of columns k, k+1 of R.
	for ( i = 0; i < n-1; i++ ) {
		k = i;
		c = cp[i],  s = sp[i];
		matrix_set_givens(gmxp, c, s);  
		matrix_resize(hmxp, k+2, 2);
		matrix_resize(mxp,  k+2, 2);
		matrix_submat_map(Rmxp, hmxp, 0, k);
		matrix_mul_abc(hmxp, gmxp, mxp);
		matrix_copy_ab(hmxp, mxp);
	} 

	// Disabled self-check.
	if ( 0 ) { 
		matrix_print(Rmxp);  matrix_print(Qmxp);
		matrix_mul_abaTd(Qmxp, Rmxp, Hmxp);
		matrix_print(Hmxp);
	}

	free(cp),  free(sp);
	matrix_dsc_fini(mxp);
	matrix_dsc_fini(hmxp);
	matrix_dsc_fini(gmxp);
}

/*
 * --- Store a Givens rotation into the 2x2 matrix mxp. ---
 *
 *     After the call the leading 2x2 block of mxp is
 *
 *       +         +
 *       |  c   s  |
 *       | -s   c  |
 *       +         +
 *
 */

void matrix_set_givens(struct matrix_dsc_s *mxp, double c, double s)
{
	double **rows = MXRPP(mxp);
	double *top = rows[0];
	double *bottom = rows[1];

	top[0] = c;
	top[1] = s;

	bottom[0] = -s;
	bottom[1] = c;
}


/*
 *   Transpose a 2x2 Givens rotation matrix in place by exchanging
 *   its two off-diagonal elements.
 */
void matrix_givens_transpose(struct matrix_dsc_s *mxp)
{
	double **rows = MXRPP(mxp);
	double upper_right = rows[0][1];

	rows[0][1] = rows[1][0];
	rows[1][0] = upper_right;
}

/*
 *  QR factorization via Francis step, implicit double-shift
 *     
 *     A = Q * R, where Q is an orthogonal matrix and 
 *                      R is an upper triangular matrix.
 *
 *  Amxp : in; OVERWRITTEN at the end with the product of the
 *         accumulated Q and R (matrix_mul_abc(qmxp, Rmxp, Amxp)).
 *  Qmxp : only used in the initial size check; the accumulated Q is
 *         held in a local matrix, written to file "Q" and released.
 *  Rmxp : receives a copy of A and is transformed in place.
 *
 *  NOTE(review): this routine prints matrices and writes the files "Q"
 *  and "R" unconditionally, so it looks like a development/verification
 *  driver -- confirm before using it as a library routine.
 *  NOTE(review): the results of MALLOC are not checked.
 */
void  matrix_francisQR(struct matrix_dsc_s *Amxp, struct matrix_dsc_s *Qmxp, struct matrix_dsc_s *Rmxp)
{
	char id[BUF_LEN];
	unsigned int i, j, m, n, nr, nc, k, q, mc;
	struct matrix_dsc_s *qmxp, *hmxp, *rmxp, *vmxp; // "shadow" matrixes for A and R
	double h11, h12, h21, h22, h32,  s, t, x, y, z,
		**Arpp, *Arp, **Qrpp, *Qrp, **Rrpp, *Rrp, *hp, *xp, beta;

	// MXSIZE_EQ / MXINDIM_EQ are defined outside this view.
	mc = MXSIZE_EQ(Amxp, Rmxp) && MXINDIM_EQ(Qmxp, Rmxp);
	if ( !mc ) {
		DBG("error: please check.");
		return;
	}
	
	// R starts out as a copy of A.
	matrix_copy_ab(Rmxp, Amxp);
	
	matrix_print(Rmxp);

	// hp = Householder vector, xp = the column it is computed from.
	m = MXROWS(Rmxp);    hp = MALLOC(m, double); 
	n = MXCOLS(Rmxp);    xp = MALLOC(m, double); 
	
	// Create the "shadow" matrix (row pointers only) for R
	sprintf(id, "shadow of %s", MXID(Rmxp));
	rmxp = matrix_create_rowp_only(id, m, n);
	
	// The id string built here is not used afterwards.
	sprintf(id, "resized %s", MXID(rmxp));
	matrix_resize(rmxp, m, n);
	
	// hmxp receives each product before it is copied back through rmxp.
	hmxp = matrix_create("temp H", 3, 3);

	// vmxp collects the Householder vectors, one per column.
	vmxp = matrix_create("house vec", m-1, n-1);

	matrix_francisQR_init(Rmxp, xp);  // compute the first column of H 
	nr = 3;
	for ( i = 0; i < n-2; i++ ) {
		vector_housevec2(nr, xp, hp, &beta);
		
		// 1.  rows movement: a 3 x nc block starting at row i, column k
		if ( i < 2 ) {
			q = 0,  nc = n, k = 0;
		} else {
			q = i,  nc--,   k++;
		}

		matrix_resize(rmxp, 3, nc);		
		matrix_resize(hmxp, 3, nc);		
		matrix_submat_map(Rmxp, rmxp, i, k);
		//DBG("row movement --> ");  matrix_print(rmxp);
		
		matrix_hh_premul(3, hp, beta, rmxp, hmxp);
		matrix_copy_ab(rmxp, hmxp);   // Save the results back to R

		// 2.  columns movement: an nr x 3 block starting at row 0,
		//     column i; nr grows by one per step until it reaches m
		if ( nr == m ) {
			// do nothing
		} else {
			 nr++;
		}
		
		matrix_resize(rmxp, nr, 3);		
		matrix_resize(hmxp, nr, 3);		
		matrix_submat_map(Rmxp, rmxp, 0, i);
		//DBG("col movement ==> ");  matrix_print(rmxp);
		
		matrix_hh_postmul(3, hp, beta, rmxp, hmxp);
		matrix_copy_ab(rmxp, hmxp);   // Save the results back to R

		// Fetch the column for the next step.
		matrix_francisQR_column(Rmxp, xp, i);

		matrix_hhvec_print(m-i-1, hp, beta);
		//vector_sprint("house vec-->", m-i-1, hp);
		// Store the vector (minus its first element) in column i of vmxp.
		matrix_save_colvec(vmxp, i, m-i-2, hp+1, i+1);
	}
	
	//matrix_print(vmxp);

	// Final step: a 2-element Householder vector.
	vector_housevec2(2, xp, hp, &beta);
	
	if ( 1 ) {
		//matrix_print(Rmxp);
		matrix_hhvec_print(2, hp, beta);
	}
	// i still holds its value from the end of the loop above.
	matrix_save_colvec(vmxp, i, 1, hp+1, i);
	matrix_print(vmxp);
	
	// This is the trailing principal 2x3 sub-matrix.
	matrix_resize(rmxp, 2, 3);		
	matrix_resize(hmxp, 2, 3);		
	matrix_submat_map(Rmxp, rmxp, m-2, n-3);
	matrix_hh_premul(2, hp, beta, rmxp, hmxp);
	matrix_copy_ab(rmxp, hmxp);

	matrix_resize(rmxp, m, 2);		
	matrix_resize(hmxp, m, 2);		
	// This is the last 2 full columns.
	matrix_submat_map(Rmxp, rmxp, 0, n-2);
	matrix_hh_postmul(2, hp, beta, rmxp, hmxp);
	matrix_copy_ab(rmxp, hmxp);
	
	matrix_print(hmxp);

	// presumably zeroes entries smaller than 1e-12 in magnitude --
	// confirm against matrix_zero_eps
	if ( 1 )
		matrix_zero_eps(Rmxp, 1e-12);

	
	// Rebuild Q from the stored vectors, then print and dump the results.
	qmxp = matrix_backaccum_francisQ(vmxp);
	matrix_print(qmxp);  matrix_write_file("Q", qmxp);
	matrix_print(Rmxp);  matrix_write_file("R", Rmxp);
	matrix_print(Amxp);
	
	// Overwrites the caller's A with the product of qmxp and Rmxp.
	matrix_mul_abc(qmxp, Rmxp, Amxp);
	//DBG("should be the same ==");
	matrix_print(Amxp);

	free(hp);   matrix_dsc_fini(hmxp);   matrix_dsc_fini(qmxp); 
	free(xp);   matrix_dsc_fini(rmxp);   matrix_dsc_fini(vmxp);
}

/*
 *  Compute the initial column vector for the Francis QR step.
 *
 *      The shift parameters are taken from the trailing 2x2 block of
 *      Amxp (its trace s and determinant t); they are then combined with
 *      the leading 2x2 block and element (2, 1) to form three values.
 *
 *      xp is a 3-element column vector whose storage must have
 *      already been allocated upon entry of this routine.
 *
 */
void matrix_francisQR_init(struct matrix_dsc_s *Amxp, double *xp)
{
	unsigned int nrows, ncols;
	double trace, det, h11, h12, h21, h22, h32;

	nrows = MXROWS(Amxp);
	ncols = MXCOLS(Amxp);

	// Trailing principal 2x2 block: source of the shift parameters.
	h11 = matrix_get_value(Amxp, nrows-2, ncols-2);
	h12 = matrix_get_value(Amxp, nrows-2, ncols-1);
	h21 = matrix_get_value(Amxp, nrows-1, ncols-2);
	h22 = matrix_get_value(Amxp, nrows-1, ncols-1);

	trace = h11 + h22;
	det   = h11 * h22 - h12 * h21;

	if ( 0 ) {	// disabled debug output
		sprintf(dbg, "Trailing principal 2x2 submatrix of %s...", MXID(Amxp)); DBG(dbg);
		sprintf(dbg, "h(11, 12) = %-16.14g %-16.14g", h11, h12); DBG(dbg); 
		sprintf(dbg, "h(21, 22) = %-16.14g %-16.14g", h21, h22); DBG(dbg); 
	}

	// Leading principal 2x2 block (the same variables are reused).
	h11 = matrix_get_value(Amxp, 0, 0);
	h12 = matrix_get_value(Amxp, 0, 1);
	h21 = matrix_get_value(Amxp, 1, 0);
	h22 = matrix_get_value(Amxp, 1, 1);

	if ( 0 ) {	// disabled debug output
		sprintf(dbg, "Leading principal 2x2 submatrix of %s...", MXID(Amxp)); DBG(dbg);
		sprintf(dbg, "h(11, 12) = %-16.14g %-16.14g", h11, h12); DBG(dbg); 
		sprintf(dbg, "h(21, 22) = %-16.14g %-16.14g", h21, h22); DBG(dbg); 
	}

	h32 = matrix_get_value(Amxp, 2, 1);
	if ( 0 ) {	// disabled debug output
		sprintf(dbg, "h32 = %-16.14g", h32);   DBG(dbg); 
	}

	xp[0] = h11 * h11 + h12 * h21 - trace * h11 + det;
	xp[1] = h21 * (h11 + h22 - trace);
	xp[2] = h21 * h32;
}

/*
 *  Fetch the in-progress column vector for the Francis QR step.
 *
 *      Reads column k of Amxp starting at row k+1: two elements always,
 *      and a third one only when row k+3 exists.  In the latter case
 *      xp[2] is left untouched otherwise.
 *
 *      xp is a 3-element column vector whose storage must have
 *      already been allocated upon entry of this routine.
 *
 */
void matrix_francisQR_column(struct matrix_dsc_s *Amxp, double *xp, unsigned int k)
{
	unsigned int nrows = MXROWS(Amxp);

	xp[0] = matrix_get_value(Amxp, k+1, k);
	xp[1] = matrix_get_value(Amxp, k+2, k);

	if ( k+3 >= nrows )
		return;

	xp[2] = matrix_get_value(Amxp, k+3, k);
}

/*
 *   Compute the double shift parameters for Francis Algorithm
 *   (or Francis double-shift step) from the trailing 2 x 2
 *   sub-matrix of mxp:
 *
 *       *sum  = its trace
 *       *prod = its determinant
 */

void matrix_francis_shift(struct matrix_dsc_s *mxp, double *sum, double *prod)
{
	unsigned int last_row, last_col;
	double t11, t12, t21, t22;

	last_row = MXROWS(mxp) - 1;
	last_col = MXCOLS(mxp) - 1;

	// Elements of the trailing 2 x 2 block, read row by row.
	t11 = matrix_get_value(mxp, last_row-1, last_col-1);
	t12 = matrix_get_value(mxp, last_row-1, last_col);
	t21 = matrix_get_value(mxp, last_row,   last_col-1);
	t22 = matrix_get_value(mxp, last_row,   last_col);

	*sum  = t11 + t22;
	*prod = t11 * t22 - t12 * t21;
}

/*
 *   NOTE(review): this routine appears to be an unfinished stub.  It
 *   computes the shift parameters and reads five elements from the
 *   leading corner of mxp, but every value is discarded: nothing is
 *   returned or stored.  matrix_francisQR_init() performs the complete
 *   first-column computation.
 */
void matrix_francs_first_column(struct matrix_dsc_s *mxp)
{
	double sum, prod, a11, a22, a12, a21, a32;

	// Trace and determinant of the trailing 2 x 2 block.
	matrix_francis_shift(mxp, &sum, &prod);

	// Leading 2 x 2 block plus element (2, 1); not used further.
	a11 = matrix_get_value(mxp, 0, 0);
	a12 = matrix_get_value(mxp, 0, 1);
	a21 = matrix_get_value(mxp, 1, 0);
	a22 = matrix_get_value(mxp, 1, 1);
	a32 = matrix_get_value(mxp, 2, 1);


}

/*
 *  Back accumulate an orthogonal matrix from Householder vectors that
 *  are stored, in their essential form (first element omitted), inside
 *  the input matrix A:
 *
 *		MATVEC_XTYPE_COL : column HH vectors stored below the diagonal;
 *		                   the result is an m x m matrix (Q)
 *		any other type   : row HH vectors stored right of the upper
 *		                   diagonal; the result is an n x n matrix
 *
 *  The result starts as the identity.  The stored vectors are then
 *  applied from the last one to the first, each to the trailing block
 *  of the result with beta = 2 / (v' * v).
 *
 *  Returns a newly created matrix; the caller releases it (the callers
 *  in this file use matrix_dsc_fini).  Amxp is only read.
 *
 *  Notes: 1) Matrix A must be an upper diagonal matrix.
 *         2) Householder vectors must be stored in the
 *              specified arrangement.
 *         3) This routine won't check these requirements.
 *         4) The result of MALLOC is not checked.
 *
 */
struct matrix_dsc_s *matrix_backaccumQR(struct matrix_dsc_s *Amxp, unsigned int type)
{
	unsigned int i, m, n, l, ne, k, qn;
	struct matrix_dsc_s *qmxp, *Qmxp, *mxp = Amxp;
	double *vp, beta;

	m = MXROWS(mxp), n = MXCOLS(mxp);

	// The original also allocated a second buffer (hhp) in both branches
	// that was never used nor freed; that leaked allocation is removed.
	if ( type == MATVEC_XTYPE_COL ) {
		Qmxp = matrix_create("Q", m, m);
		vp   = MALLOC(m, double);
		k = m - 1;  // offset is 1
		l = m - 1;
		qmxp = matrix_create_rowp_only("shadow matrix", m, m);
	} else {
		Qmxp = matrix_create("Q", n, n);
		vp   = MALLOC(n, double);
		k = n - 1;
		l = n - 2;    // row index, need to subtract another
		              // 1 to account for the upper diagonal
		qmxp = matrix_create_rowp_only("shadow matrix", n, n);
	}

	ne = 1, qn = 2 ; 
	vp[0] = 1.0;    // the first element is always 1 in a Householder vector 
	matrix_set_id(Qmxp);

	for ( i = l; i; i--) {

		// 1. Load the essential part of the Householder vector behind
		//    the implicit leading 1.
		if ( type == MATVEC_XTYPE_COL) {
			matrix_load_colvec(mxp, i-1, ne, vp+1, k);
		} else {
			matrix_load_rowvec(mxp, i-1, ne, vp+1, k);
		}
		
		// 2. Resize the "shadow" matrix ...
		matrix_resize(qmxp, qn, qn);

		// 3. ... and map it onto the trailing qn x qn block of Q.
		matrix_submat_map(Qmxp, qmxp, k-1, k-1);

		// 4.  beta associated with the Householder vector
		beta = 2.0 / vectors_inner_prod(qn, vp, vp);

		// 5. Householder premultiplication, in place on the mapped block
		matrix_hh_premul(qn, vp, beta, qmxp, qmxp);

		ne++,  // no. of elements incremented
		k--,   // offset decremented
		qn++;  // qmxp size incremented by 1 x 1
	}

	matrix_dsc_fini(qmxp);
	free(vp);

	return Qmxp;
}

/** --- Recover Q matrix from its factored form from Francis QR  --- **/
/*
 *  mxp holds the Householder vectors as columns, each without its
 *  leading element (which is taken to be 1).  The result is a newly
 *  created (m+1) x (n+1) matrix, where m x n is the size of mxp; it
 *  starts as the identity and the vectors are applied from the last
 *  column to the first.  The caller releases the returned matrix.
 *
 *  NOTE(review): the shadow block is printed on every step.
 *  NOTE(review): the result of MALLOC is not checked.
 */
struct matrix_dsc_s *matrix_backaccum_francisQ(struct matrix_dsc_s *mxp)
{
	unsigned int i, m, n, ci, ne, k;
	struct matrix_dsc_s *Qmxp, *qmxp;
	double *hp, beta;

	//Assumed is the HH vectors are stored in column vectors in the input matrix.

	m = MXROWS(mxp),  n = MXCOLS(mxp); 
	hp = MALLOC(m, double);
	
	Qmxp = matrix_create("Q from Francis QR step", m+1, n+1);
	qmxp = matrix_create_rowp_only("shadow", 2, 2);

	matrix_set_id(Qmxp);

	// hp[0] is the implicit leading 1; ne = number of stored elements to
	// load, k = row offset in mxp and block offset in Q.
	hp[0] = 1.0, ne = 1, k = m-1;
	for ( i = n; i; i--) {
		ci = i-1;
		if ( 0 ) {
			sprintf(dbg, "ci, ne, k = %d, %d, %d", ci, ne, k);
			DBG(dbg); 
		}
		matrix_load_colvec(mxp, ci, ne, hp+1, k);
		//vector_sprint("HH vec=> ", ne+1, hp);
		
		// Map the (ne+1) x (ne+1) block of Q that starts at (k, k).
		matrix_resize(qmxp, ne+1, ne+1);
		matrix_submat_map(Qmxp, qmxp, k, k);
	
		beta = 2.0 / vectors_inner_prod(ne+1, hp, hp);
		// The first argument is not used by matrix_hh_premul.
		matrix_hh_premul(n, hp, beta, qmxp, qmxp);
		matrix_print(qmxp);
		// The first pass (i == n) keeps ne and k, so the last two
		// columns are processed with the same ne and k.
		if ( i < n ) {
			ne++;
			k--;
		}
	}

	free(hp);
	
	matrix_dsc_fini(qmxp);

	return Qmxp;
}

/*
 *
 *  QR factorization via Givens/plane rotation
 *     
 *     A = Q * R, where Q is an orthogonal matrix and 
 *                      R is an upper triangular matrix.
 *    
 *  Sizes:
 *     A :  m x n
 *     Q :  m x m
 *     R :  m x n
 *
 *  R first receives a copy of A and is then transformed in place, one
 *  rotation per sub-diagonal element, working up each column from the
 *  bottom row.  Every rotation is also accumulated into Q, which starts
 *  as the identity.
 *
 *  At the end Amxp is overwritten with the product computed by
 *  matrix_mul_abc(Qmxp, Rmxp, Amxp).
 *
 *  If the size checks on the three matrices fail, an error is logged
 *  and nothing is computed.
 */

void  matrix_givensQR(struct matrix_dsc_s *Amxp, struct matrix_dsc_s *Qmxp, struct matrix_dsc_s *Rmxp)
{
	unsigned int i, j, m, n, k, mc, an;
	struct matrix_dsc_s *gmxp, *qmxp, *rmxp;

	// MXSIZE_EQ / MXINDIM_EQ are defined outside this view.
	mc = MXSIZE_EQ(Amxp, Rmxp) && MXINDIM_EQ(Qmxp, Rmxp);
	if ( !mc ) {
		DBG("error: please check.");
		return;
	}
	
	matrix_copy_ab(Rmxp, Amxp);  matrix_update_idstring(Rmxp, "R");
	matrix_set_id(Qmxp);

	m = MXROWS(Amxp);   
	n = MXCOLS(Amxp);   an = n;
	
	gmxp = matrix_create("Givens transformation",  2, 2);
	// The Q shadow covers two full columns of Q, i.e. all m rows.  It was
	// created with a hard-coded 6 rows, which is only right for m == 6.
	qmxp = matrix_create_rowp_only("shadow for Q", m, 2);
	rmxp = matrix_create_rowp_only("shadow for R", 2, 2);

	for ( j = 0; j < n-1; j++ ) { // loop thru all columns of R
		
		for ( i = m-1; i>j; i-- ) { // from the bottom row up to the diagonal
			k = i-1;
			// rows k, k+1 of R, from column j to the last column
			matrix_resize(rmxp, 2, an);  // resize the shadow matrix
			matrix_submat_map(Rmxp, rmxp, k, j); // map the shadow onto R matrix
			// columns k, k+1 of Q, all rows
			matrix_submat_map(Qmxp, qmxp, 0, k);
			matrix_givensCS(gmxp, rmxp, qmxp);
		}

		an--; 
	}
	
	
	matrix_mul_abc(Qmxp, Rmxp, Amxp);
	
	matrix_dsc_fini(gmxp);
	matrix_dsc_fini(rmxp);
	matrix_dsc_fini(qmxp);
}


/*
 *   Compute the Givens rotation that belongs to the first column of the
 *   two-row block rmxp, then apply it to R and Q:
 *
 *     - rmxp is replaced by (transposed rotation) * rmxp, computed
 *       from a temporary duplicate of rmxp;
 *     - qmxp is combined with the plain rotation by matrix_mul_ab.
 *
 *   gmxp is 2x2 scratch storage; on return it holds the plain rotation.
 */ 
void  matrix_givensCS(struct matrix_dsc_s *gmxp, struct matrix_dsc_s *rmxp, struct matrix_dsc_s *qmxp) 
{
	struct matrix_dsc_s *copy;
	double **grows, **rrows, *g0, *g1, a, b, cosv, sinv;

	grows = MXRPP(gmxp);
	rrows = MXRPP(rmxp);

	g0 = grows[0];
	g1 = grows[1];

	// The pair to rotate: first element of each of the two rows of R.
	a = rrows[0][0];
	b = rrows[1][0];

	matrix_givens_rotate(a, b, &cosv, &sinv);

	// Transposed rotation, applied to R from the left.
	g0[0] = cosv;
	g0[1] = -sinv;
	g1[0] = sinv;
	g1[1] = cosv;

	copy = matrix_dup(rmxp);
	matrix_mul_abc(gmxp, copy, rmxp);
	matrix_dsc_fini(copy);

	// Plain rotation, applied to Q.
	g0[0] = cosv;
	g0[1] = sinv;
	g1[0] = -sinv;
	g1[1] = cosv;

	matrix_mul_ab(qmxp, gmxp);
}

/*
 *   Givens-rotation sweep over the matrix pair (A, B), as used to prepare a
 *   QZ iteration.
 *
 *   Steps performed, in order:
 *     1. matrix_houseQR(Bmxp, q, r) is called on B.
 *     2. The local q is written to the file "Q" and A is replaced via
 *        matrix_mul_aTbb(q, A)  (A = q.T * A).
 *     3. For every column j and every row i from the bottom up to j+2, a
 *        row rotation built from A(i-1,j), A(i,j) is applied to A and B
 *        (and its transpose to Qmxp), followed by a column rotation built
 *        from B(i,i), B(i,i-1) applied to A, B and Zmxp.
 *
 *   Amxp, Bmxp : modified in place.
 *   Qmxp, Zmxp : receive the accumulated rotations; they are only updated
 *                here, never initialised (presumably the caller passes
 *                identity matrices -- TODO confirm).
 *
 *   NOTE(review): the Z update uses the offsets (i-1, j) although it runs in
 *   column mode like the A/B column updates, which use (0, i-1) -- verify.
 *   NOTE(review): nothing here copies the local R factor into Bmxp; whether
 *   B ends up triangular depends on matrix_houseQR() -- verify.
 */
void  matrix_hessQZ(struct matrix_dsc_s *Amxp, struct matrix_dsc_s *Bmxp, 
	struct matrix_dsc_s *Qmxp, struct matrix_dsc_s *Zmxp)
{
	unsigned int i, j, m, n, r1, r2, c1, c2, flag_xpose, v;
	double a, b;
	struct matrix_dsc_s *gmxp, *smxp, *tmxp, *cmxp, *qmxp, *rmxp;

	m = MXROWS(Amxp);   v = 0;    // set v to non-zero to trace the touched windows
	n = MXCOLS(Amxp);   cmxp = matrix_diagram_create("QZ diagram", m, n);
	smxp = matrix_create_rowp_only("shadow", 2, 2);
	gmxp = matrix_create("Givens rotation", 2, 2);
	tmxp = matrix_create("temp", 2, 2);
	// 1. Do QR step to B, B = Q * R,  B = R, i.e. B is overwritten with R.
	qmxp = matrix_create("Q for B = Q * R", m, n);
	rmxp = matrix_create("R for B = Q * R", m, n);
	
	matrix_houseQR(Bmxp, qmxp, rmxp);   // compute B = Q * R
	
	// 2. Compute A = Q.T * A
	//
	matrix_write_file("Q", qmxp);
	matrix_mul_aTbb(qmxp, Amxp);   // Note: here 'a' is qmxp and 'b' is Amxp

	// "j + 2 < n" instead of "j < n-2": n is unsigned, so n-2 would wrap
	// around for matrices with fewer than two columns.
	for ( j = 0; j + 2 < n; j++ ) {
		for ( i = n-1; i > j+1; i-- ) {
			
			// 1. The first round, the rows update
			a = matrix_get_value(Amxp, i-1, j);
			b = matrix_get_value(Amxp, i,   j);
			flag_xpose = 1;  // require transposed Givens rotation matrix
			matrix_givens_compute(gmxp, a, b, flag_xpose);

			// The diagram is drawn on cmxp, never on A itself:
			// matrix_submat_diagram() clears the matrix it is given.
			r1 = i-1, r2 = i, c1 = j, c2 = n-1;   // A
			matrix_mulsub_ab(Amxp, gmxp, smxp, tmxp, r1, c1, MATVEC_XTYPE_ROW);
			if ( v )
				matrix_submat_diagram(cmxp, r1, r2, c1, c2, 0);
			

			r1 = i-1, r2 = i, c1 = i-1, c2 = n-1; // B
			matrix_mulsub_ab(Bmxp, gmxp, smxp, tmxp, r1, c1, MATVEC_XTYPE_ROW);
			if ( v )
				matrix_submat_diagram(cmxp, r1, r2, c1, c2, 0);
			
			matrix_xpose(gmxp);
			r1 = 0, r2 = m-1, c1 = i-1, c2 = i;   // Q
			matrix_mulsub_ab(Qmxp, gmxp, smxp, tmxp, r1, c1, MATVEC_XTYPE_COL);
			if ( v )
				matrix_submat_diagram(cmxp, r1, r2, c1, c2, 0);
			

			// 2. The second round, the columns update

			a = -matrix_get_value(Bmxp, i, i);   // note the '-' sign
			b =  matrix_get_value(Bmxp, i, i-1);
			flag_xpose = 0;   // standard Givens rotation matrix, not the transpose
			matrix_givens_compute(gmxp, a, b, flag_xpose);

			r1 = 0, r2 = n-1, c1 = i-1, c2 = i;   // A
			matrix_mulsub_ab(Amxp, gmxp, smxp, tmxp, r1, c1, MATVEC_XTYPE_COL);
			if ( v )
				matrix_submat_diagram(cmxp, r1, r2, c1, c2, 0);
			
			r1 = 0, r2 = i, c1 = i-1, c2 = i;     // B 
			matrix_mulsub_ab(Bmxp, gmxp, smxp, tmxp, r1, c1, MATVEC_XTYPE_COL);
			if ( v )
				matrix_submat_diagram(cmxp, r1, r2, c1, c2, 0);
			
			r1 = i-1, r2 = i, c1 = j, c2 = n-1;   // Z
			matrix_mulsub_ab(Zmxp, gmxp, smxp, tmxp, r1, c1, MATVEC_XTYPE_COL);
			if ( v )
				matrix_submat_diagram(cmxp, r1, r2, c1, c2, 0);
			
		}
	}
	
	// Release every work matrix, including the shadow descriptor that the
	// previous version leaked.
	matrix_dsc_fini(gmxp);   matrix_dsc_fini(qmxp);
	matrix_dsc_fini(tmxp);   matrix_dsc_fini(rmxp);
	matrix_dsc_fini(cmxp);   matrix_dsc_fini(smxp);
}


/*
 *  Householder reduction of A to upper Hessenberg form.
 *   Amxp : input matrix; it is only read (copied into Hmxp)
 *   Hmxp : output; receives a copy of A and is reduced in place. On return
 *          everything below the first sub-diagonal has been zeroed.
 *   Umxp : output; passed to matrix_set_id() first, then its sub-block
 *          starting at (1, 1) is overwritten with the matrix returned by
 *          matrix_backaccumQR().
 *
 *  Returns without doing anything (after a debug message) when the size
 *  checks MXSIZE_EQ(A, H) / MXINDIM_EQ(H, U) fail.
 *
 *  NOTE(review): the column loop bound "n-2" is unsigned and wraps around
 *  for n < 2; the MALLOC() results are not checked.
 */
void  matrix_houseHessenberg(struct matrix_dsc_s *Amxp, struct matrix_dsc_s *Hmxp, struct matrix_dsc_s *Umxp)
{
	char id[BUF_LEN];
	unsigned int i, j, m, n, nvec, ci, ne, k, mc, am, an, qm, qn;
	struct matrix_dsc_s *qmxp, *amxp, *hmxp, *umxp; // "shadow" matrixes for A and R
	double **Arpp, *Arp, **Hrpp, *Hrp, **Urpp, *Urp, *hp, *xp, beta;

	mc = MXSIZE_EQ(Amxp, Hmxp) && MXINDIM_EQ(Hmxp, Umxp);
	if ( !mc ) {
		DBG("error: please check.");
		return;
	}
	
	matrix_copy_ab(Hmxp, Amxp);   // direct modification on Hmxp
	matrix_set_id(Umxp);
	//matrix_print(Hmxp);

	// hp holds the current Householder vector, xp the column it is built from.
	m = MXROWS(Amxp);     hp = MALLOC(m, double); 
	n = MXCOLS(Amxp);     xp = MALLOC(m, double); 

	// k  : first row of the part of the column being eliminated
	// ne : number of elements in that part (shrinks by one per column)
	k = 1, ne = n - 1;
	
	hmxp = matrix_create_rowp_only("shadow", 2, 2);  // m x n just
	amxp = matrix_create("temp", 2, 2);  // temp values, will be resized.      
	for ( j = 0; j < n-2; j++ ) {
		ci = j;
		
		// 1. Load the column indexed by ci from H.
		matrix_load_colvec(Hmxp, ci, ne, xp, k);
		vector_sprint("loaded column vec ==> ", ne, xp);
		
		// 2. Compute the Householder vector.
		vector_housevec2(ne, xp, hp, &beta);

		// 3. Map the sub-matrix to do HH premul.  
		matrix_resize(hmxp, ne, n-j);
		matrix_resize(amxp, ne, n-j);
		matrix_submat_map2(Hmxp, hmxp, k, k-1);
		//DBG(" pre-mult ==> "); matrix_print(hmxp);

		// 4. Householder premultiplication
		// matrix_hh_premul( ..., ..., ..., hmxp, hmxp); should work
		// The result lands in amxp and is copied back through the shadow
		// hmxp, i.e. into the mapped window of Hmxp.
		matrix_hh_premul(ne, hp, beta, hmxp, amxp);  
		matrix_copy_ab(hmxp, amxp);
		
		// 5. Map the sub-matrix to do HH postmul.  
		matrix_resize(hmxp, m, n-j-1);
		matrix_resize(amxp, m, n-j-1);
		matrix_submat_map2(Hmxp, hmxp, 0, k);
		//DBG(" post-mult ==> "); matrix_print(hmxp);

		// 6. Householder postmultiplication 
		matrix_hh_postmul(ne, hp, beta, hmxp, amxp);
		matrix_copy_ab(hmxp, amxp);

		// 7. Save the Householder vector.
		// Only hp[1..ne-1] is stored, starting one row below k.
		matrix_save_colvec(Hmxp, ci, ne-1, hp+1, k+1);
		
		//matrix_print(Hmxp);
		
		ne--, k++;
	}


	// Hand the window of Hmxp starting at (1, 0) (where the Householder
	// vectors were saved) to the back-accumulation routine.
	matrix_resize(hmxp, m-1, n-1);
	matrix_submat_map(Hmxp, hmxp, 1, 0);
	qmxp = matrix_backaccumQR(hmxp, MATVEC_XTYPE_COL);


	// Copy the accumulated matrix into the block of Umxp starting at (1, 1).
	matrix_submat_map2(Umxp, hmxp, 1, 1);
	matrix_copy_ab(hmxp, qmxp);

	// Wipe the stored Householder vectors: zero below the first sub-diagonal.
	matrix_set_uptriangular(Hmxp, 1);
	
	free(hp);    matrix_dsc_fini(hmxp);
	free(xp);    matrix_dsc_fini(amxp);  matrix_dsc_fini(qmxp);
}

/*
 *  Schur-style iteration built on the Hessenberg reduction.
 *
 *   Amxp : input matrix; reduced to Hessenberg form in a private copy
 *   Smxp : work/output matrix, second argument of every matrix_houseQR() call
 *   Umxp : work/output matrix, third argument of every matrix_houseQR() call
 *
 *  After the Hessenberg step a fixed number (150) of unshifted QR steps is
 *  run:  factor H with matrix_houseQR(H, S, U), then H = U * S.
 *  Smxp and Umxp are printed at the end and keep the factors of the last
 *  step.  NOTE(review): which of the two is the triangular and which the
 *  orthogonal factor depends on matrix_houseQR() -- confirm.
 *
 *  Returns early with a debug message when the size checks fail.
 */
void  matrix_houseSchur(struct matrix_dsc_s *Amxp, struct matrix_dsc_s *Smxp, struct matrix_dsc_s *Umxp)
{
	const unsigned int qr_steps = 150;
	unsigned int step;
	struct matrix_dsc_s *hess;

	if ( !( MXSIZE_EQ(Amxp, Smxp) && MXINDIM_EQ(Smxp, Umxp) ) ) {
		DBG("error: please check.");
		return;
	}

	// Work on a private copy so that A itself is left alone.
	hess = matrix_dup2("Hessenberg form", Amxp);
	matrix_houseHessenberg(Amxp, hess, Umxp);

	for ( step = 0; step != qr_steps; step++ ) {
		matrix_houseQR(hess, Smxp, Umxp);   // QR step
		matrix_mul_abc(Umxp, Smxp, hess);   // recombine in reverse order
	}

	matrix_print(Smxp);
	matrix_print(Umxp);

	matrix_dsc_fini(hess);   // the intermediate Hessenberg matrix is private
}


/** Make mxp upper triangular (k = 0) or upper Hessenberg-like (k > 0).
 *  Every element more than k rows below the main diagonal is set to 0,
 *  i.e. element (i, j) is cleared when  i > j + k.
 *
 *      k = 1 (6 x 6)                 k = 0 (5 x 5)
 *      x   x   x   x   x  x          x x x x x
 *      x   x   x   x   x  x          0 x x x x
 *      0   x   x   x   x  x          0 0 x x x
 *      0   0   x   x   x  x          0 0 0 x x
 *      0   0   0   x   x  x          0 0 0 0 x
 *      0   0   0   0   x  x
 *
 *  mxp : matrix modified in place
 *  k   : number of sub-diagonals that are kept
 *  **/
void  matrix_set_uptriangular(struct matrix_dsc_s *mxp, unsigned int k)
{
	unsigned int i, j, m, n;
	double **rpp, *rp;
	
	m = MXROWS(mxp);  rpp = MXRPP(mxp);
	n = MXCOLS(mxp);

	for ( i = 1; i < m; i++ ) {
		// Rows 0..k lie entirely inside the kept band.  Testing i > k first
		// also keeps the unsigned difference (i - k) from wrapping around,
		// which used to zero whole rows for k >= 2.
		if ( i <= k )
			continue;

		rp = rpp[i];
		for ( j = 0; j < n; j++ ) {
			if ( (i-k) > j )
				rp[j] = 0;
		}
	}
}

/** Make mxp lower triangular (k = 0) or keep k super-diagonals (k > 0).
 *  Every element more than k columns to the right of the main diagonal is
 *  set to 0, i.e. element (i, j) is cleared when  j > i + k.
 *
 *  mxp : matrix modified in place
 *  k   : number of super-diagonals that are kept.  The previous version
 *        ignored this argument and always behaved as if k were 1; k = 1
 *        still gives exactly the old result.
 **/
void  matrix_set_lowtriangular(struct matrix_dsc_s *mxp, unsigned int k)
{
	unsigned int i, j, m, n;
	double **rpp, *rp;
	
	m = MXROWS(mxp);  rpp = MXRPP(mxp);
	n = MXCOLS(mxp);

	for ( i = 0; i < m; i++ ) {
		rp = rpp[i];
		for ( j = 1; j < n; j++ ) {
			// Written as j > i + k so that no unsigned subtraction can wrap.
			if ( j > i + k )
				rp[j] = 0;
		}
	}
}

/*
 *   Reduce mxp, in place, to its tridiagonal band: the upper-triangular
 *   and the lower-triangular helper are each called with a band width of 1.
 */
void  matrix_set_tridiag(struct matrix_dsc_s *mxp)
{
	const unsigned int band = 1;   // diagonals kept on each side of the main one

	matrix_set_uptriangular(mxp, band);
	matrix_set_lowtriangular(mxp, band);
}

/*
 *   Shift the matrix in place:  mxp = mxp + mu * I.
 *
 *   mu is ADDED to every diagonal element; pass a negative value to
 *   subtract a shift.  The row count is used as the length of the diagonal,
 *   so mxp must be a square matrix.
 */
void  matrix_shift(struct matrix_dsc_s *mxp, double mu)
{
	const unsigned int order = MXROWS(mxp);
	double **rows = MXRPP(mxp);
	unsigned int d;

	for ( d = 0; d < order; d++ )
		rows[d][d] += mu;
}

/** --- Flush small entries: every element of mxp whose magnitude is
 *      strictly below eps is replaced by 0.0 (in place).  --- **/
void matrix_zero_eps(struct matrix_dsc_s *mxp, double eps)
{
	const unsigned int nrows = MXROWS(mxp);
	const unsigned int ncols = MXCOLS(mxp);
	double **rows = MXRPP(mxp);
	unsigned int r, c;

	for ( r = 0; r < nrows; r++ ) {
		double *row = rows[r];

		for ( c = 0; c < ncols; c++ ) {
			if ( fabs(row[c]) < eps )
				row[c] = 0.0;
		}
	}
}

/** --- Print a Householder vector through DBG(): beta, the element
 *      count n, then the n elements of vp.  The text is assembled in the
 *      global dbg buffer. --- **/
void matrix_hhvec_print(unsigned int n, double *vp, double beta)
{
    unsigned int k;
    char *tail = dbg;   // next write position inside dbg

    tail += sprintf(tail, "House vector: beta = %g, (%d)", beta, n);
    for ( k = 0; k < n; k++ )
        tail += sprintf(tail, " %8.6g ", vp[k]);

    DBG(dbg);
}


/*
 *   Run shifted QR steps on a private copy of mxp.
 *
 *   Each step:  mu = A(n-1, n-1);  A = A - mu*I;  A = Q * R;
 *               A = R * Q;         A = A + mu*I.
 *
 *   The previous version read the iteration count, the shift and both
 *   factor matrices without ever initialising them.  They are now set up
 *   explicitly: Q and R are allocated with the shape of mxp, the shift is
 *   the bottom-right diagonal element, and the iteration count matches the
 *   fixed 150 steps used by matrix_houseSchur().
 *
 *   mxp : square input matrix; it is not modified.
 *
 *   NOTE(review): the iterated copy is released at the end, exactly as
 *   before, so the caller never sees the result.  Confirm whether it should
 *   be copied back into mxp.
 */
void  matrix_step_QR(struct matrix_dsc_s *mxp)
{
	unsigned int i, m, n, niter;
	struct matrix_dsc_s *Amxp, *Qmxp, *Rmxp;
	double mu;

	m = MXROWS(mxp);
	n = MXCOLS(mxp);
	if ( m == 0 || m != n ) {   // matrix_shift() needs a non-empty square matrix
		DBG("error: please check.");
		return;
	}

	niter = 150;
	Amxp = matrix_dup2("A", mxp);
	Qmxp = matrix_create("Q for A = Q * R", m, n);
	Rmxp = matrix_create("R for A = Q * R", m, n);
	
	for ( i = 0; i < niter; i++ ) {
		// Compute mu: use the current bottom-right diagonal element.
		mu = matrix_get_value(Amxp, n-1, n-1);

		matrix_shift(Amxp, -mu);   // shift before the QR step
		matrix_houseQR(Amxp, Qmxp, Rmxp);
		
		matrix_mul_abc(Rmxp, Qmxp, Amxp);
		matrix_shift(Amxp, mu);    // shift back after the R * Q
	}

	matrix_dsc_fini(Amxp);
	matrix_dsc_fini(Qmxp);
	matrix_dsc_fini(Rmxp);
}

/* 
 *    D = A.T * B * C
 *
 *    The intermediate product A.T * B is held in a temporary matrix of
 *    cols(A) x cols(B) that is released before returning.
 */
void  matrix_mul_aTbcd(struct matrix_dsc_s *amxp, struct matrix_dsc_s *bmxp,
	struct matrix_dsc_s *cmxp, struct matrix_dsc_s *dmxp)
{
	struct matrix_dsc_s *atb;

	atb = matrix_create("temp storage", MXCOLS(amxp), MXCOLS(bmxp));

	matrix_mul_aTbc(amxp, bmxp, atb);    // atb = A.T * B
	matrix_mul_abc(atb, cmxp, dmxp);     // D   = atb * C

	matrix_dsc_fini(atb);
}

/* 
 *    D = A * B * C.T
 *
 *    The intermediate product A * B is held in a temporary matrix of
 *    rows(A) x cols(B) that is released before returning.
 */
void  matrix_mul_abcTd(struct matrix_dsc_s *amxp, struct matrix_dsc_s *bmxp,
	struct matrix_dsc_s *cmxp, struct matrix_dsc_s *dmxp)
{
	struct matrix_dsc_s *ab;

	ab = matrix_create("temp storage", MXROWS(amxp), MXCOLS(bmxp));

	matrix_mul_abc(amxp, bmxp, ab);      // ab = A * B
	matrix_mul_abTc(ab, cmxp, dmxp);     // D  = ab * C.T

	matrix_dsc_fini(ab);
}

/* 
 *    D = A.T * B * A
 *
 *    Thin wrapper: the three-factor product with A used as both the first
 *    (transposed) and the last factor.
 */
void  matrix_mul_aTbad(struct matrix_dsc_s *amxp, struct matrix_dsc_s *bmxp, struct matrix_dsc_s *dmxp)
{
	struct matrix_dsc_s *outer = amxp;   // same matrix on both sides of B

	matrix_mul_aTbcd(outer, bmxp, outer, dmxp);
}

/* 
 *    D = A * B * A.T
 *
 *    Thin wrapper: the three-factor product with A used as both the first
 *    and the last (transposed) factor.
 */
void  matrix_mul_abaTd(struct matrix_dsc_s *amxp, struct matrix_dsc_s *bmxp, struct matrix_dsc_s *dmxp)
{
	struct matrix_dsc_s *outer = amxp;   // same matrix on both sides of B

	matrix_mul_abcTd(outer, bmxp, outer, dmxp);
}

/*
 *  Build a new nr x nc matrix from the elements of a vector.
 *
 *   ne  : number of elements in vp (not consulted here; vp must hold at
 *         least nr * nc values)
 *   vp  : source values
 *   nr  : rows of the new matrix
 *   nc  : columns of the new matrix
 *   row : MATVEC_XTYPE_ROW fills the matrix row by row (row-major order);
 *         any other value fills it column by column (the FORTRAN
 *         convention, column-major order)
 *
 *  Returns the new matrix, or NULL (after a debug message) when it could
 *  not be created.  The caller owns the result.
 */
struct matrix_dsc_s *vector2matrix(unsigned int ne, double *vp, 
	unsigned int nr, unsigned int nc, unsigned int row)
{
	unsigned int i, j, m, n, k;
	struct matrix_dsc_s  *mxp;
	double **rpp, *rp;

	m = nr, n = nc, k = 0;
	mxp = matrix_create("vec2mat", m, n);

	if ( mxp ) {
		rpp = MXRPP(mxp);
		if ( row == MATVEC_XTYPE_ROW ) {
			// by rows
			for ( i = 0; i < m; i++ ) {
				rp = rpp[i];
				for ( j = 0; j < n; j++ ) {
					rp[j] = vp[k++];
				}
			}

		} else {
			// by columns, the FORTRAN convention: walk one column at a
			// time, top to bottom.  (The previous version indexed the rows
			// with the column index, which ran out of bounds for
			// non-square matrices.)
			for ( j = 0; j < n; j++ ) {
				for ( i = 0; i < m; i++ ) {
					rpp[i][j] = vp[k++];
				}
			}
		}
	} else {
		sprintf(dbg, "error to create matrix = %d", errno);
		DBG(dbg); 
	}

	return mxp;
}

/*
 *  Transfer data elements in a matrix to a newly allocated vector. 
 *  
 *  row == MATVEC_XTYPE_COL, the default, column-major 
 *                           order as used in FORTRAN
 *
 *             +    +                 
 *             |1  3|  ==>   [1 2 3 4 ]'
 *             |2  4| 
 *             +    +
 *
 *  row == MATVEC_XTYPE_ROW, row-major order 
 *
 *             +    +
 *             |1  3|  ==>   [1 3 2 4 ]'
 *             |2  4| 
 *             +    +
 *
 *  mxp : source matrix (not modified)
 *  ne  : if not NULL, receives the number of elements (rows * cols)
 *  row : MATVEC_XTYPE_ROW for row-major order; any other value selects
 *        column-major order
 *
 *  Returns the vector, which the caller must free(), or NULL (after a
 *  debug message) when the allocation failed; *ne is left untouched then.
 */
double *matrix2vector(struct matrix_dsc_s *mxp, unsigned int *ne, unsigned int row)
{
	unsigned int i, j, m, n, nn, k;
	double **rpp, *rp, *vp;

	m = MXROWS(mxp), n = MXCOLS(mxp),  rpp = MXRPP(mxp);
	k = 0,  nn = m * n;
	
	vp = MALLOC(nn, double);
	if ( vp ) {
		if ( row == MATVEC_XTYPE_ROW ) {
			// by rows
			for ( i = 0; i < m; i++ ) {
				rp = rpp[i];
				for ( j = 0; j < n; j++ ) {
					vp[k++] = rp[j];
				}
			}
	
		} else {
			// by columns, the FORTRAN convention: walk one column at a
			// time, top to bottom.  (The previous version indexed the rows
			// with the column index, which ran out of bounds for
			// non-square matrices.)
			for ( j = 0; j < n; j++ ) {
				for ( i = 0; i < m; i++ ) {
					vp[k++] = rpp[i][j];
				}
			}
		}

		if ( ne )
			*ne = nn;
	
	} else {
		sprintf(dbg, "malloc() error: %d", errno); DBG(dbg); 
	}

	return vp;
}


/*
 *  Draw a diagram of a rectangular window: mxp is cleared, every element
 *  inside rows rlo..rhi and columns clo..chi (inclusive) is set to 1, and
 *  the matrix is printed with matrix_diagram_print().
 *
 *  type : non-zero means the four bounds are 1-based and are converted to
 *         0-based indices first; zero means they are already 0-based.
 *
 *  The chr field of mxp is saved on entry and restored on exit.
 */
void  matrix_submat_diagram(struct matrix_dsc_s *mxp, unsigned int rlo, 
	unsigned int rhi, unsigned int clo, unsigned int chi, unsigned int type)
{
	const unsigned int saved_chr = mxp->chr;
	const unsigned int base = type ? 1 : 0;   // index base of the caller's bounds
	const unsigned int row_first = rlo - base, row_last = rhi - base;
	const unsigned int col_first = clo - base, col_last = chi - base;
	unsigned int r, c;

	matrix_clear(mxp);

	for ( r = row_first; r < row_last + 1; r++ )
		for ( c = col_first; c < col_last + 1; c++ )
			matrix_set_value(mxp, r, c, 1);

	matrix_diagram_print(mxp);

	mxp->chr = saved_chr;
}

/*
 *  This routine is not for general-purpose use in matrix multiplication.
 *  It is written simply for the ease of programming in C for some
 *  complicated sub-matrix operations that are commonly arising in matrix
 *  orthogonal transformation.
 *
 *  A sub-matrix of a "bigger" matrix is either premultiplied or postmultiplied 
 *  by a Givens rotation matrix (gmxp).
 *  
 *  This version is limited to 2 rows if in column extension mode,
 *                      and 2 columns if in row extension mode.
 *
 *  amxp:  the "bigger" matrix whose sub-matrix is to multiply or be multiplied 
 *         by the Givens matrix.
 *  gmxp:  the Givens rotation matrix
 *  smxp:  the shadow matrix to be mapped onto and involved in the multiplication
 *  tmxp:  the matrix whose storage holds the temporary multiplication results
 *  moff:  the offset to row index of amxp, starting with 0
 *  noff:  the offset to column index of amxp, starting with 0
 *  type:  MATVEC_XTYPE_ROW: with the no. of rows in submatrix fixed,
 *                          column extension mode
 *         MATVEC_XTYPE_COL: with the no. of columns in submatrix fixed,
 *                          row extension mode
 *
 *         The two modes the routine operates 
 *
 *     MATVEC_XTYPE_ROW            MATVEC_XTYPE_COL 
 *   +--------------------+    +---------------------+
 *   |                    |    |                     |
 *   |                    |    |    (moff, noff)     |
 *   |    (moff, noff)    |    |         * X         |
 *   |         * X X X X X|    |         X X         | 
 *   |         X X X X X X|    |         X X         | 
 *   |                    |    |         X X         |
 *   |                    |    |         X X         |
 *   +--------------------+    +---------------------+
 *
 *   The '*' is the first 'X' in the 'X' submatrix whose position
 *   is specified by (moff, noff).
 *
 */
void matrix_mulsub_ab(struct matrix_dsc_s *amxp, struct matrix_dsc_s *gmxp, struct matrix_dsc_s *smxp, struct matrix_dsc_s *tmxp, unsigned int moff, unsigned int noff,  unsigned int type)
{
	const int row_mode = ( type == MATVEC_XTYPE_ROW );
	unsigned int win_rows, win_cols;

	// The window starts at (moff, noff) and always reaches the edge of
	// amxp in its "long" direction; the other direction is fixed at 2.
	if ( row_mode ) {
		win_rows = 2;
		win_cols = MXCOLS(amxp) - noff;
	} else {
		win_rows = MXROWS(amxp) - moff;
		win_cols = 2;
	}

	// Shadow and scratch matrix take the shape of the window ...
	matrix_resize(smxp, win_rows, win_cols);
	matrix_resize(tmxp, win_rows, win_cols);

	// ... and the shadow is laid over the window inside amxp.
	matrix_submat_map(amxp, smxp, moff, noff);

	// Row mode premultiplies by the rotation, column mode postmultiplies.
	if ( row_mode )
		matrix_mul_abc(gmxp, smxp, tmxp);
	else
		matrix_mul_abc(smxp, gmxp, tmxp);

	// Writing through the shadow stores the product back into amxp.
	matrix_copy_ab(smxp, tmxp);
}

/*
 *  Map (and print) the two-column window of amxp that ends at the
 *  diagonal position doff: rows 0..doff, columns doff-1 and doff.
 *
 *  smxp and tmxp are resized to (doff+1) x 2 and smxp is mapped onto the
 *  window.  The multiplication by gmxp and the copy-back are currently
 *  commented out, so amxp is not modified.
 *
 *  NOTE(review): doff is unsigned, so doff == 0 makes the column offset
 *  doff-1 wrap around.
 */
void matrix_mulsub_ab_cols( struct matrix_dsc_s *amxp, struct matrix_dsc_s *gmxp, struct matrix_dsc_s *smxp, struct matrix_dsc_s *tmxp, unsigned int doff)
{
	const unsigned int win_rows = doff + 1;
	const unsigned int win_cols = 2;
	const unsigned int row_off = 0;
	const unsigned int col_off = doff - 1;

	matrix_resize(smxp, win_rows, win_cols);
	matrix_resize(tmxp, win_rows, win_cols);

	matrix_submat_map(amxp, smxp, row_off, col_off);
	matrix_print(smxp);
	//matrix_mul_abc(smxp, gmxp, tmxp);  //postmultiplication

	//matrix_copy(smxp, tmxp);
}

/**--------------------------  NLA end --------------------**/



/** --- Descriptor for the vector operations (dot product, angle,
 *      point-to-plane distance) implemented by vector_op() and the
 *      orthogonality helpers below. --- **/
struct vector_op_s {
    unsigned int op: 4,          // operation selector read by vector_op() (MATRIX_OP_DPD, _XPD, _DIST)
      num_matrixes : 5,          // not referenced in this part of the file
         column_vec: 1,          // 0: row vectors; 1: column vectors (set to 0 by vector_op_init())
           vec_orth: 1,          // not referenced in this part of the file
                 x1: 1,          // not referenced in this part of the file
                 x2: 1,          // not referenced in this part of the file
                 xx: 1;          // not referenced in this part of the file

    // mp: the 2 x N data matrix loaded by vector_op_init2() and read by vector_op().
    // xprod: not referenced in this part of the file (presumably the cross product -- confirm).
    struct matrix_dsc_s *mp, *xprod;
    // mop: matrix set loaded by vector_op_init(); released by vector_op_fini().
    struct matrix_op_s *mop;

    // Results written by vector_op(): dot product, angle in radians
    // (the acos() result), and point-to-plane distance.
    double prod, theta, dist;
};

// Forward declarations for routines that are used before their definition.
unsigned int vectors_are_orth(struct vector_op_s *vecp);
unsigned int vector_is_orth(struct vector_op_s *vecp);
void vector_print(struct vector_op_s *vecp);
void vector_test(struct vector_op_s *vecp);

struct vector_op_s *vector_op_init2(char *datafilep)
{
    struct vector_op_s *vecp;
    struct matrix_dsc_s *mp;

    vecp = calloc(1, sizeof(struct vector_op_s) );
    vecp->mp = mp = matrix_init(datafilep);  // special case: must be in 2 x num_cols format

    if ( mp->num_rows != 2 ) {
        sprintf(dbg, " Error: the input data file must be in 2 x colsformat. %d", 
            mp->num_rows); DBG(dbg); 
        matrix_dsc_fini(mp);
        free(vecp);
        vecp = NULL;
    }

    return vecp;
}

struct vector_op_s *vector_op_init(char *datafilep)
{
    struct vector_op_s *vecp;

    vecp = calloc(1, sizeof(struct vector_op_s) );
    vecp->mop = matrix_op_init2(datafilep);  
    vecp->column_vec = 0;  // 0 : row vectors; 1: column vectors

    return vecp;
}

/*
 *  Release a descriptor created by vector_op_init(): the matrix set in
 *  vecp->mop, then the descriptor itself.  A NULL vecp is accepted and
 *  ignored, like free(NULL).
 *
 *  NOTE(review): vecp->mp (set by vector_op_init2()) is not released
 *  here -- confirm who owns it.
 */
void vector_op_fini(struct vector_op_s *vecp)
{
    if ( !vecp )
        return;

    if ( vecp->mop )
        matrix_op_fini(vecp->mop);
    free(vecp);
}


void vector_op(struct vector_op_s *vecp)
{
    unsigned int j, m, op;
    struct matrix_dsc_s *mp;
    double *r1p, *r2p, prod, n1, n2;
    
    mp = vecp->mp;   m = mp->num_cols;
    r1p = mp->rowp[0];
    r2p = mp->rowp[1];
    
    op = vecp->op;
    switch ( op ) {

    case MATRIX_OP_DPD: 
        prod = n1 = n2 = 0.0;
        for ( j = 0; j < m; j++ ) {
            prod += r1p[j] * r2p[j];
            n1 += r1p[j] * r1p[j];
            n2 += r2p[j] * r2p[j];
        }

        vecp->prod = prod;
        vecp->theta = acos( prod / sqrt( n1 * n2 ) );
    break;
    
    case MATRIX_OP_XPD:  // Cross product
    break;

    case MATRIX_OP_DIST: // Distance from a point to a plane
        prod = n1 = n2 = 0.0;
        for ( j = 0; j < m-1; j++ ) {
            prod += r2p[j] * r2p[j];
            n1 += r1p[j] * r2p[j];
        }
        n1 += r2p[m-1];
        vecp->dist = fabs(n1) / sqrt(prod);
    break;

    default: 
        sprintf(dbg, " = %d", op); DBG(dbg); 
    break;
    } 
}

/** Set the first n elements of vp to 0.0. **/
void vector_zero(unsigned int n, double *vp)
{
    unsigned int left = n;

    while ( left-- )
        *vp++ = 0.0;
}

/*
 *  Scaled outer product of two vectors.
 *
 *  Creates a new m x n matrix whose element (i, j) is
 *  scale * mp[i] * np[j]  and returns it; the caller owns the result.
 */
struct matrix_dsc_s *vectors_outer_prod(unsigned int m, unsigned int n, double *mp, double *np, double scale)
{
	char title[BUF_LEN];
	struct matrix_dsc_s *out;
	double **rows;
	unsigned int r, c;

	sprintf(title, "outer prod. of vectors %d x %d", m, n);
	out = matrix_create(title, m, n);
	rows = MXRPP(out);

	for ( r = 0;  r < m; r++ ) {
		double *dst = rows[r];

		for ( c = 0;  c < n; c++ )
			dst[c] = scale * mp[r] * np[c];
	}

	return out;
}

/* ==== inner (dot) product of the first n elements of v1p and v2p;
 *      0.0 for n == 0 ==== */
double vectors_inner_prod(unsigned int n, double *v1p, double *v2p)
{
    double acc = 0.0;
    unsigned int left = n;

    while ( left-- )
        acc += *v1p++ * *v2p++;

    return acc;
}

/** Sum of the squares of the first n elements of vp, i.e. the SQUARED
 *  two-norm.  No square root is taken here, despite the name. **/
double vector_norm_sqrt(unsigned int n, double *vp)
{
    double sumsq = 0.0;
    unsigned int left = n;

    for ( ; left; left--, vp++ )
        sumsq += *vp * *vp;

    return sumsq;
}

/** Arithmetic mean of the first n elements of vp: their sum divided by n
 *  (no special handling for n == 0). **/
double statistics_mean(unsigned int n, double *vp)
{
    double total = 0.0;
    unsigned int k = 0;

    while ( k < n )
        total += vp[k++];

    return total/n;
}

/*
 *  Sample standard deviation of the first n elements of vp:
 *
 *        sqrt( sum( (vp[i] - mean)^2 ) / (n - 1) )
 *
 *  The sample standard deviation is not defined for fewer than two
 *  values; 0.0 is returned in that case (n - 1 would otherwise be zero or,
 *  n being unsigned, wrap around).
 */
double statistics_std_deviation(unsigned int n, double *vp)
{
    unsigned int i;
    double mean, dev, sumsq;

    if ( n < 2 )
        return 0.0;

    mean = statistics_mean(n, vp);

    // The accumulator must start at zero; it used to be read uninitialised.
    sumsq = 0.0;
    for ( i = 0; i < n; i++ ) {
        dev = vp[i] - mean;
        sumsq += dev * dev;
    }

    return sqrt(sumsq / (n-1));
}

/*
 *  Norm of the vector (v1, v2, ... vn).
 *
 *  degree   meaning         formula
 * ------------------------------------------------------------------------
 *    0      infinity-norm   max(|vi|)
 *    1      one-norm        |v1| + |v2| + ... + |vn|
 *    2      two-norm        sqrt(v1*v1 + v2*v2 + ... + vn*vn)
 *
 *  Any other degree is a fatal error: a debug message is printed and the
 *  program exits with status 1.
 */
double vector_norm(unsigned int degree, unsigned int n, double *vp)
{
    double result, mag;
    unsigned int k;

    if ( degree == 0 ) {
        // largest magnitude
        result = 0.0;
        for ( k = 0; k < n; k++ ) {
            mag = fabs(vp[k]);
            if ( mag > result )
                result = mag;
        }
    } else if ( degree == 1 ) {
        // sum of magnitudes
        result = 0.0;
        for ( k = 0; k < n; k++ )
            result += fabs(vp[k]);
    } else if ( degree == 2 ) {
        // Euclidean length
        result = sqrt( vector_norm_sqrt(n, vp) );
    } else {
        sprintf(dbg, " degree = %d; must be 0, 1 or 2.", degree); DBG(dbg);  exit(1);
    }

    return result;
}

/** Sum of the first n elements of vp (0 for n == 0). **/
double vector_sum(unsigned int n, double *vp)
{
    double total = 0;
    unsigned int left = n;

    while ( left-- )
        total += *vp++;

    return total;
}

/*
 *  Projection of u onto v:   v * (u . v) / (v . v).
 *
 *  The result is written into vp, i.e. v itself is overwritten with the
 *  projection; up is only read.  The projected vector is also printed with
 *  vector_print2().
 */
void vector_u_proj_on_v(unsigned int n, double *up, double *vp)
{
    const double dot = vectors_inner_prod(n, up, vp);   // u . v
    const double vv  = vector_norm_sqrt(n, vp);         // v . v
    unsigned int k;

    for ( k = 0; k < n; k++ )
        vp[k] = vp[k] * dot / vv;

    vector_print2(n, vp);
}

/** =======   vp *= l   (every one of the n elements is multiplied by l)  ===== **/
void vector_scale(unsigned int n, double *vp, double l)
{
    unsigned int left = n;

    for ( ; left; left--, vp++ )
        *vp *= l;
}

/** Scale vp in place to unit two-norm: every element is divided by the
 *  Euclidean length of the vector (no special handling for a zero vector). **/
void vector_normalize(unsigned int n, double *vp)
{
    const double len = vector_norm(2, n, vp);
    unsigned int left = n;

    while ( left-- )
        *vp++ /= len;
}

/** Reverse the order of the first n elements of vp, in place. **/
void vector_reverse(unsigned int n, double *vp)
{
    unsigned int lo, hi;
    double held;

    if ( n < 2 )
        return;

    // Swap from both ends towards the middle.
    for ( lo = 0, hi = n - 1; lo < hi; lo++, hi-- ) {
        held = vp[lo];
        vp[lo] = vp[hi];
        vp[hi] = held;
    }
}

/*
 *  Rotate the n elements of vp circularly by cnt positions towards higher
 *  indices: the element at index i moves to index (i + cnt) % n.
 *
 *  Nothing is done for an empty vector or when cnt is a multiple of n.
 *  If the scratch buffer cannot be allocated an error is printed and vp is
 *  left unchanged.
 */
void vector_circular(unsigned int n, double *vp, unsigned int cnt)
{
    unsigned int i, shift;
    double *up;

    if ( n == 0 )          // cnt % n would divide by zero
        return;

    shift = cnt % n;
    if ( shift == 0 )
        return;

    up = malloc(n * sizeof(double) );
    if ( !up ) {
        fprintf(stderr, "# vector_circular: malloc() error\n");
        return;
    }

    for ( i = 0; i < n; i++ )
        up[(i + shift) % n] = vp[i];

    memcpy(vp, up, n * sizeof(double));
    free(up);              // the scratch copy used to be leaked
}

/*
 *  Rotate the n elements of vp circularly by cnt positions towards lower
 *  indices, i.e. undo a vector_circular() by the same cnt.  Implemented
 *  as a forward rotation by n - (cnt % n).
 *
 *  Nothing is done for an empty vector or when cnt is a multiple of n.
 */
void vector_circular_back(unsigned int n, double *vp, unsigned int cnt)
{
    unsigned int back;

    if ( n == 0 )          // cnt % n would divide by zero
        return;

    back = cnt % n;
    if ( back )
        vector_circular(n, vp, n - back);
}

/** =======   vp[i] += scalar   for each of the n elements  ===== **/
void vector_add_scalar(unsigned int n, double *vp, double scalar)
{
    unsigned int left = n;

    while ( left-- )
        *vp++ += scalar;
}

/** =======   up[i] += vp[i] * l   for each of the n elements  ===== **/
void vector_add_scaled(unsigned int n, double *up, double *vp, double l)
{
    unsigned int left = n;

    for ( ; left; left--, up++, vp++ )
        *up += *vp * l;
}

/** =======   up[i] += vp[i]   for each of the n elements  ===== **/
void vector_u_plus_v(unsigned int n, double *up, double *vp)
{
    unsigned int left = n;

    while ( left-- )
        *up++ += *vp++;
}

/** =======   up[i] -= vp[i]   for each of the n elements  ===== **/
void vector_u_minus_v(unsigned int n, double *up, double *vp)
{
    unsigned int left = n;

    while ( left-- )
        *up++ -= *vp++;
}

/*
 *  Fill the first n elements of vp with random numbers, one rand01()
 *  call per element.  rand01() forwards to ran1(), whose result is a
 *  positive value capped just below 1.
 */
void vector_rand01(unsigned int n, double *vp)
{
    unsigned int left = n;

    while ( left-- )
        *vp++ = rand01();
}

/** Zero the first n elements of vp by clearing their bytes. **/
void vector_clear(unsigned int n, double *vp)
{
    const size_t nbytes = sizeof(double) * n;

    memset(vp, 0, nbytes);
}

/** Copy n doubles from srcp to dstp; the two areas must not overlap. **/
void vector_dup(unsigned int n, double *dstp, double *srcp)
{
    const size_t nbytes = sizeof(double) * n;

    memcpy(dstp, srcp, nbytes);
}

/*
 *  Return a newly allocated copy of the n doubles at srcp.
 *
 *  The caller must free() the result.  NULL is returned when the
 *  allocation fails (the copy used to be attempted regardless).
 */
double *vector_dup2(unsigned int n, double *srcp)
{
	double *dstp = MALLOC(n, double);

	if ( dstp )
		memcpy(dstp, srcp, sizeof(double) * n);
	
	return dstp;
}

/** Copy n unsigned ints from srcp to dstp; the two areas must not overlap. **/
void vector_dup_int(unsigned int n, unsigned int *dstp, unsigned int *srcp)
{
    const size_t nbytes = sizeof(unsigned int) * n;

    memcpy(dstp, srcp, nbytes);
}

void run_vector(void)
{
    char dp[] = "lp";
    struct vector_op_s *vecp;

    vecp = vector_op_init(dp);
    vecp->op = MATRIX_OP_XPD;  // Cross product
    vecp->op = MATRIX_OP_DPD;  // Dot product
    //vecp->op = MATRIX_OP_DIST;  // Distance
           /*      Data file format
            *      2  4
            *      1  1  1  0   <-- (point, last col must be 0.)
            *      2  2  2  2   <-- the plane in a*x + b*y + c*z + d = 0
            */
    if ( vecp ) {
        //vector_op(vecp);
        //vector_print(vecp);
        if ( vectors_are_orth(vecp) )
            DBG("orthogonal --> Y");
        else
            DBG("orthogonal --> N");
        
        vector_test(vecp);   // for test only
        
        vector_op_fini(vecp);
    }
    

}

/*
 *  Ad-hoc exercise of the vector helpers; all results go to the debug
 *  output.
 *
 *  Part 1 rotates row 0 of the first matrix held by vecp->mop in place
 *  (forward by 4, then back by 1), so that matrix is modified.
 *  Part 2 builds an 8-element vector, reverses it, turns it into an 8 x 8
 *  circular matrix and tests the column <-> row copy helpers on it.  The
 *  circular matrix is handed to vecp->mop with matrix_add_mxp() so that it
 *  is released together with the other matrices.
 */
void vector_test(struct vector_op_s *vecp)
{
    struct matrix_op_s *mop = vecp->mop;
    struct matrix_dsc_s *mxp;
    unsigned int i, j, m, n;
    double **rpp, *rp, *r2p;
    
    // Part 1: work directly on row 0 of the first loaded matrix.
    mxp = mop->mpp[0];
    matrix_print(mxp);
    n = mxp->num_cols;   // test with row vector
    rpp = mxp->rowp;
    rp = rpp[0];
    
    vector_print2(n, rp);
    //vector_reverse(n, rp);
    //DBG("After vector_reverse() ...");
    vector_circular(n, rp, 4);
    DBG("After vector_circular() ... forward by 4 pos");
    vector_print2(n, rp);
    vector_circular_back(n, rp, 1);
    DBG("After vector_circular() ... backward by 1 pos");
    vector_print2(n, rp);
    

    // Part 2: rp now points to a private, zero-initialised 8-element vector.
    n = 8;  m = 4;   rp = calloc( n, sizeof(double) );
    for ( i = 0; i < m; i++ )
        rp[i] = (double)(i + 1);     // init it to [1, 2, 3, 4, 0, 0, 0, 0]
    
    vector_print2(n, rp);
    DBG("reverse the vector ...");
    vector_reverse(n, rp);
    vector_print2(n, rp);
    
    /*
    mxp = matrix_create("circular matrix 8x8", n, n);
    rpp = mxp->rowp;
    matrix_add_mxp(vecp->mop, mxp);
    for ( i = 0; i < mxp->num_rows; i++ ) {
        r2p = rpp[i];
        memcpy(r2p, rp, n * sizeof(double) );
        vector_circular(n, r2p, i);
    } */
    mxp = matrix_circular_create(n, rp);
    matrix_add_mxp(vecp->mop, mxp);  // PLEASE remember to add this mxp 
                                     // to mop for later memory release.
    
    matrix_print(mxp);
    
    /*
     * test
     *
     * void vector_col2row(double **colpp, double *rowp, unsigned int sz, unsigned int col_num)
     * void vector_row2col( double *rowp, double **colpp, unsigned int sz, unsigned int col_num)
     *
     *
     */
    // Copy column 0 of the circular matrix into rp.
    DBG("Testing  vector_col2row() ...");
    n = 8;
    vector_col2row(mxp->rowp, rp, n, 0);
    vector_print2(n, rp);

    // This step actually exercises vector_row2col(): rp is written into
    // column 1 of the circular matrix (the message below still names
    // vector_col2row()).
    DBG("Testing  vector_col2row() ...");
    vector_row2col(rp, mxp->rowp, n, 1);
    vector_print2(n, rp);

    matrix_print(mxp);

    free(rp);   // the private vector of part 2; the matrices belong to mop
}

/** Given an n-element (double, row) vector, build a new n x n circular
 *  matrix: row i is a copy of vp rotated forward by i positions with
 *  vector_circular().  The caller owns the returned matrix. **/
struct matrix_dsc_s *matrix_circular_create(unsigned int n, double *vp)
{
    struct matrix_dsc_s *circ;
    double **rows;
    unsigned int r;

    circ = matrix_create("circular matrix 8x8", n, n);
    rows = circ->rowp;

    for ( r = 0; r < circ->num_rows; r++ ) {
        double *dst = rows[r];

        memcpy(dst, vp, n * sizeof(double) );
        vector_circular(n, dst, r);
    }

    return circ;
}

/** Copy one column of a matrix into a flat (row) vector.
 *
 *  colpp   : the row-pointer table of the matrix (e.g. mxp->rowp)
 *  rowp    : destination; must already provide room for sz elements
 *  sz      : number of rows to read
 *  col_num : index of the column to extract, starting with 0
 *
 *  Illustration:
 *      mxp ===>  4   5   6
 *                5   6   7
 *                -9  7  -1
 *
 *      col_num   0   1   2
 *
 *      after calling vector_col2row( mxp->rowp, rp, 3, 1)
 *      rp ----->  5  6   7   i.e.  rp is the second column vector from mxp.
 * */
void vector_col2row(double **colpp, double *rowp, unsigned int sz, unsigned int col_num)
{
    unsigned int r;

    for ( r = 0; r < sz; r++ )
        rowp[r] = colpp[r][col_num];
}

/** Store a flat (row) vector into one column of a matrix.
 *
 *  rowp    : source vector with sz elements
 *  colpp   : the row-pointer table of the matrix (e.g. mxp->rowp)
 *  sz      : number of rows to write
 *  col_num : index of the column to overwrite, starting with 0
 *
 *  Illustration:
 *      rp  ==>   1   2   3
 *
 *      mxp ===>  4   5   6
 *                5   6   7
 *                -9  7  -1
 *
 *      col_num   0   1   2
 *
 *      after calling vector_row2col( rp, mxp->rowp, 3, 1)
 *      mxp ===>  4   1   6
 *                5   2   7
 *                -9  3  -1
 *
 *      i.e. the second column vector of mxp is from 
 *        row vector rp (1, 2, 3)
 */
void vector_row2col( double *rowp, double **colpp, unsigned int sz, unsigned int col_num)
{
    unsigned int r;

    for ( r = 0; r < sz; r++ )
        colpp[r][col_num] = rowp[r];
}

/** Check whether the two vectors of vecp are orthogonal.
 *  Forces the operation to the dot product, runs vector_op() and returns
 *  1 when the resulting vecp->prod is exactly 0.0, otherwise 0. **/
unsigned int vector_is_orth(struct vector_op_s *vecp)
{
    vecp->op = MATRIX_OP_DPD;  // Dot product
    vector_op(vecp);

    if ( vecp->prod == 0.0 )
        return 1;

    return 0;
}

/** === Check whether all row vectors of a matrix are mutually orthogonal.
 *      The matrix examined is vecp->mop->mpp[ mop->ia ].
 *
 *      Returns 1 when the inner product of every pair of distinct rows is
 *      exactly 0.0 (also for a matrix with fewer than two rows), and 0 as
 *      soon as one pair with a non-zero inner product is found. === **/
unsigned int vectors_are_orth(struct vector_op_s *vecp)
{
    unsigned int i, j, n, m;
    struct matrix_op_s *mop;
    struct matrix_dsc_s *mxp;
    double *r1p, *r2p;
    
    mop = vecp->mop; 
    sprintf(dbg, " mop->ia = %d", mop->ia); DBG(dbg); 
    mxp = mop->mpp[ mop->ia ];
    n = mxp->num_rows;
    m = mxp->num_cols;

    // "i + 1 < n" instead of "i < n-1": n is unsigned, so n-1 would wrap
    // around for a matrix without rows.
    for ( i = 0; i + 1 < n; i++ ) {
        r1p = mxp->rowp[i];
        for ( j = i+1; j < n; j++ ) {
            r2p = mxp->rowp[j];
            if ( vectors_inner_prod(m, r1p, r2p) != 0.0 )
                return 0;
        }
    }
    
    return 1;
}


/** Print the results stored in vecp through DBG(): the dot product, the
 *  angle converted from radians to degrees, and the distance. **/
void vector_print(struct vector_op_s *vecp)
{
    const double theta_deg = vecp->theta * 180 / M_PI;

    sprintf(dbg, "prod %g theta = %g dist = %g", vecp->prod, theta_deg, vecp->dist );
    DBG(dbg); 
}


/** Print msg followed by the n elements of vp on one DBG() line.  The
 *  text is assembled in the global dbg buffer. **/
void vector_sprint(char *msg, unsigned int n, double *vp)
{
    unsigned int k;
    char *tail = dbg;   // next write position inside dbg

    tail += sprintf(tail, "%s ", msg);
    for ( k = 0; k < n; k++ )
        tail += sprintf(tail, " %8.5g ", vp[k]);

    DBG(dbg);
}

/** Write the n elements of vp to a text file, replacing any previous
 *  content.
 *
 *  file : path of the file to create
 *  f    : non-zero writes one value per line; zero writes all values on a
 *         single line, separated by blanks
 *
 *  A debug message is printed when the file cannot be opened. **/
void vector_write_file(char *file, unsigned int n, double *vp, unsigned int f)
{
    const char *fmt;
    unsigned int k;
    FILE *out;

    out = fopen(file, "w");
    if ( out == NULL ) {
        sprintf(dbg, "Error: open file \"%s\"", file); DBG(dbg); 
        return;
    }

    fmt = f ? " %g\n" : " %g ";
    for ( k = 0; k < n; k++ )
        fprintf(out, fmt, vp[k]);

    fclose(out);
}

/** Print the n elements of vp on one DBG() line (an empty line for
 *  n == 0).  The text is assembled in the global dbg buffer. **/
void vector_print2(unsigned int n, double *vp)
{
    unsigned int k;
    char *tail = dbg;   // next write position inside dbg

    *tail = '\0';
    for ( k = 0; k < n; k++ )
        tail += sprintf(tail, " %g ", vp[k]);

    DBG(dbg);
}

/** Print the n integers of vp on one DBG() line (an empty line for
 *  n == 0).  The text is assembled in the global dbg buffer. **/
void vector_print_int(unsigned int n, int *vp)
{
    unsigned int k;
    char *tail = dbg;   // next write position inside dbg

    *tail = '\0';
    for ( k = 0; k < n; k++ )
        tail += sprintf(tail, " %d ", vp[k]);

    DBG(dbg);
}

/** Print msg followed by the n integers of vp on one DBG() line.  The
 *  text is assembled in the global dbg buffer. **/
void vector_sprint_int(char *msg, unsigned int n, int *vp)
{
    unsigned int k;
    char *tail;

    strcpy(dbg, msg);
    tail = dbg + strlen(dbg);   // continue right after the message
    for ( k = 0; k < n; k++ )
        tail += sprintf(tail, " %d ", vp[k]);

    DBG(dbg);
}



/* ======= random number generation ======= */
/*
 * Map one ran1() draw onto an integer-valued double in the range 1..10.
 *
 * The draw is scaled to 10*f + 1, clamped to [1, 10] and truncated to a
 * whole number.
 *
 * The former function-local static accumulator (l += time(NULL)) was never
 * read and could overflow a signed long, so it has been removed.
 */
double gen_sd_num(void)
{
    double s;

    s = 10.0 * ran1() + 1.0;
    s = min(10.0, s);
    s = max(1.0, s);

    return (double)((unsigned int) s);
}

/** ----- Return the next ran1() draw (a value in (0, 1), never negative). -----
 *
 *  The previous header advertised the range -1..1, which the code never
 *  produced.  The unused locals and the dead static accumulator
 *  (l += time(NULL), a potential signed overflow) have been removed.
 **/
double rand01(void)
{
    return ran1();
}

/*
 * Pseudo-random generator returning a double no larger than RNMX1
 * (1 - 1.2e-7).
 *
 * The constants and the 32-entry shuffle table follow the Park-Miller
 * "minimal standard" multiplicative generator with Schrage's
 * overflow-free update, as in the well-known ran1 routine.
 * State (l, iy, iv[]) is kept in function-local statics, so the routine
 * is not reentrant.
 *
 * NOTE(review): unlike the textbook routine, time(NULL) is added to the
 * state on every call.  On the first call iy is 0 and the state is
 * positive, so "-(*idum) < 1" holds and the state is reset to 1 before
 * the warm-up.  On later calls the state can exceed IM1 before the update;
 * where long is 32 bits wide that addition can overflow.  Confirm whether
 * this per-call reseeding is intended.
 */
double ran1(void)
{
#define IA1    16807 
#define IM1    2147483647 
#define AM1    (1.0 / IM1) 
#define IQ1    127773 
#define IR1    2836 
#define NTAB1  32 
#define NDIV1  (1 + ( IM1 - 1) / NTAB1) 
#define EPS11  1.2e-7 
#define RNMX1  (1.0-EPS11) 
    int j; 
    long k; 
    long   *idum, lseed = (long)time(NULL);
    static long l = 0, iy = 0; 
    static long iv[NTAB1]; 
    double temp; 
    
    idum = &l;
    l += lseed;   /* perturb the state with the current time on every call */
    if (*idum <= 0 || !iy) { 
        /* (re)initialise: force a positive seed, then warm up and fill iv[] */
        if (-(*idum) < 1) 
            *idum=1; 
        else *idum = -(*idum); 
        for (j = NTAB1+7;j>=0;j--) { 
            /* the first 8 results are discarded, the next NTAB1 fill the table */
            k = (*idum) / IQ1; 
            *idum = IA1 * (*idum-k * IQ1) - IR1 * k; 
            if (*idum < 0) *idum += IM1; 
            if (j < NTAB1) iv[j] = *idum; 
        }
        iy=iv[0]; 
    }
    /* advance the generator: idum = (IA1 * idum) mod IM1 via Schrage's method */
    k = (*idum)/IQ1;
    *idum = IA1 * (*idum - k * IQ1)- IR1 * k; 
    if (*idum < 0) *idum += IM1; 
    /* shuffle: the previous output selects which table slot is emitted/refilled */
    j = iy / NDIV1; 
    iy = iv[j]; 
    iv[j] = *idum; 
    /* scale to a double and cap at RNMX1 so 1.0 is never returned */
    if ((temp = AM1 * iy ) > RNMX1) 
        return RNMX1; 
    else
        return temp;
#undef IA1    
#undef IM1  
#undef AM1    
#undef IQ1 
#undef IR1 
#undef NTAB1 
#undef NDIV1  
#undef EPS11  
#undef RNMX1  
}
/* ======= random number generation end ======= */




/** Check whether the row vectors of a matrix form an orthonormal set.
 *  Returns 1 when the rows are both mutually orthogonal and of unit
 *  length, 0 otherwise. **/

unsigned int matrix_basis_orthonormal(struct matrix_dsc_s *mxp)
{
    /* Normality is only evaluated when the rows are already orthogonal. */
    if ( !matrix_basis_orthogonal(mxp) )
        return 0;

    return matrix_basis_normal(mxp) ? 1 : 0;
}

/*
 * Check whether every pair of distinct rows of mxp has a zero inner product.
 *
 * Returns 1 if all pairs are orthogonal (also for matrices with fewer than
 * two rows), 0 as soon as one non-orthogonal pair is found.
 *
 * The comparison with 0.0 is exact, as before.
 * NOTE(review): rounding error can make mathematically orthogonal rows
 * fail this test; consider a tolerance.
 */
unsigned int matrix_basis_orthogonal(struct matrix_dsc_s *mxp)
{
    unsigned int i, j;
    unsigned int m = mxp->num_rows;
    unsigned int n = mxp->num_cols;

    /* "i + 1 < m" instead of "i < m - 1": the latter wraps around for m == 0 */
    for ( i = 0; i + 1 < m; i++ ) {
        for ( j = i + 1; j < m; j++ ) {
            if ( vectors_inner_prod(n, mxp->rowp[i], mxp->rowp[j]) != 0.0 )
                return 0;
        }
    }

    return 1;
}

/** Check whether every row vector of the matrix has an inner product with
 *  itself equal to exactly 1.0.  Returns 1 if so, 0 otherwise. **/
unsigned int matrix_basis_normal(struct matrix_dsc_s *mxp)
{
    const unsigned int rows = mxp->num_rows;
    const unsigned int cols = mxp->num_cols;
    unsigned int r = 0;

    while ( r < rows ) {
        double *v = mxp->rowp[r];

        if ( vectors_inner_prod(cols, v, v) != 1.0 )
            return 0;
        r++;
    }

    return 1;
}


/** ============== utils: permutation and combination ============= **/
/* Forward declarations for the counting helpers and the util_permcomb_*
 * workspace routines defined further down in this file. */
unsigned int factorial(unsigned int n);
unsigned int total_num_perm(unsigned int n);
unsigned int total_num_comb(unsigned int n, unsigned int m);
void util_permcomb_fini(struct util_permcomb_s *pcp);
struct util_permcomb_s *util_permcomb_init(unsigned int comb_flag, unsigned int m, unsigned int n);
void  util_permcomb_ops(struct util_permcomb_s *pcp);
void  util_permcomb_print(struct util_permcomb_s *pcp);

/*
 * Demo driver: create a combination workspace (comb_flag = 1, m = 5, n = 3),
 * run the generator on it and release it again.
 */
void run_utils(void)
{
    struct util_permcomb_s *pcp = util_permcomb_init(1, 5, 3);

    util_permcomb_ops(pcp);
    util_permcomb_fini(pcp);
}

/*
 * Report the mode (combination/permutation) and dimensions of the
 * workspace through DBG.
 *
 * TODO: the actual generation step was never implemented.  The previous
 * body ended in an empty while(1) loop, so the function never returned to
 * its caller; it now returns after logging.
 */
void  util_permcomb_ops(struct util_permcomb_s *pcp)
{
    unsigned int m, n;

    m = pcp->num_rows;
    n = pcp->num_cols;
    /* %u: m and n are unsigned */
    sprintf(dbg, "%s rows = %u, cols = %u ", pcp->comb? "combination":"permutation", m, n ); DBG(dbg); 
}


/*
 * Log the mode and dimensions of the workspace through DBG, then print
 * its table to stdout, one row per line.
 */
void  util_permcomb_print(struct util_permcomb_s *pcp)
{
    const unsigned int rows = pcp->num_rows;
    const unsigned int cols = pcp->num_cols;
    unsigned int r, c;

    sprintf(dbg, "%s rows = %d, cols = %d ", pcp->comb? "combination":"permutation", rows, cols ); DBG(dbg); 
    for ( r = 0; r < rows; r++ ) {
        unsigned int *row = pcp->gcpp[r];

        c = 0;
        while ( c < cols ) {
            printf(" %d ", row[c]);
            c++;
        }
        printf("\n");
    }
}

struct util_permcomb_s *util_permcomb_init(unsigned int comb_flag, unsigned int m, unsigned int n)
{
    unsigned int i, j, *rp;
    struct util_permcomb_s *pcp;

    pcp = calloc(1, sizeof(struct util_permcomb_s ) );
    pcp->comb = comb_flag;
    
    if ( comb_flag ) {
        // combination generation
        pcp->num_rows = total_num_comb(m, n);
        pcp->num_cols = n;
    } else {
        // permutation generation
        pcp->num_rows = m;
        pcp->num_cols = m;
    }
    sprintf(dbg, " row = %d cols =%d", pcp->num_rows, pcp->num_cols); DBG(dbg); 
    pcp->gcpp = malloc( pcp->num_rows  * sizeof(unsigned int *) );

    for ( i = 0; i < pcp->num_rows; i++) {
        pcp->gcpp[i] = rp = malloc( pcp->num_cols * sizeof(unsigned int));
        for ( j = 0; j < pcp->num_cols; j++ ) {
            rp[j] = j;  // initialize the array
        }
    }

    return pcp;
}

/* Release a workspace created by util_permcomb_init(): each row, the row
 * table, then the descriptor itself. */
void util_permcomb_fini(struct util_permcomb_s *pcp)
{
    const unsigned int rows = pcp->num_rows;
    unsigned int r = 0;

    while ( r < rows ) {
        free( pcp->gcpp[r] );
        r++;
    }
    free(pcp->gcpp);
    free(pcp);
}

 

/*
 * Return n! computed in unsigned int arithmetic.
 * factorial(0) is 1.  The product wraps modulo UINT_MAX + 1 once it no
 * longer fits (n > 12 for a 32-bit unsigned int).
 */
unsigned int factorial(unsigned int n)
{
    unsigned int product = 1;
    unsigned int k;

    for ( k = 2; k <= n; k++ ) {
        product *= k;
    }

    return product;
}

/*
 * Return the binomial coefficient C(a, b), where a is the larger and b the
 * smaller of the two arguments (the argument order does not matter).
 *
 * The previous implementation evaluated a!/((a-b+1)! (a-b)!), which only
 * equals C(a, b) when a == 2b - 1 (for example it returned 2 for C(4, 2)
 * and 0 for C(5, 1)).  This version uses the multiplicative formula
 *
 *     C(a, k) = prod_{i=1..k} (a - k + i) / i,   k = min(b, a - b)
 *
 * Every intermediate value is itself a binomial coefficient, so each
 * division is exact.  A 64-bit accumulator postpones overflow; the result
 * is truncated to unsigned int on return.
 */
unsigned int total_num_comb(unsigned int n, unsigned int m)
{
    unsigned int a, b, k, i;
    unsigned long long c = 1;

    if ( n > m ) {
        a = n;
        b = m;
    } else {
        a = m;
        b = n;
    }

    /* use the smaller of b and a-b to minimise work and intermediate size */
    k = ( b < a - b ) ? b : a - b;
    for ( i = 1; i <= k; i++ ) {
        c = c * (a - k + i) / i;
    }

    return (unsigned int)c;
}

/* Number of permutations of n items: delegates to factorial(n). */
unsigned int total_num_perm(unsigned int n)
{
    unsigned int count = factorial(n);

    return count;
}


/*
 * Return C(m, n), the number of ways to choose n items out of m.
 *
 * The falling product m * (m-1) * ... * (m-n+1) is built factor by factor.
 * After each multiplication the largest still-pending denominator factor
 * is divided out if it divides the running product exactly, which keeps
 * the intermediate value small.  Denominator factors that could not be
 * cancelled on the way (2..pending) are divided out at the end.
 */
unsigned int util_comb_num(unsigned int m, unsigned int n)
{
    unsigned int numer = 1;
    unsigned int pending = n;     /* largest denominator factor not yet cancelled */
    unsigned int factor = m;      /* next numerator factor */
    unsigned int left = n;        /* numerator factors still to multiply in */
    unsigned int denom = 1;
    unsigned int f;

    while ( left != 0 ) {
        numer *= factor;
        if ( numer % pending == 0 ) {
            numer /= pending;
            pending--;
        }
        factor--;
        left--;
    }

    for ( f = pending; f > 1; f-- ) {
        denom *= f;
    }

    return numer / denom;
}

/**** Enumerate the combinations of n numbers out of 1..m.
 *
 *  Returns a util_comb_num(m, n) x n matrix whose rows are the
 *  combinations in increasing lexicographic order: the first row is
 *  1, 2, ..., n and each following row is the successor of the previous one.
 *
 *  Fixes over the previous version: row 1 is no longer touched when the
 *  matrix has a single row, an empty result (num == 0) is returned
 *  untouched, and the "position to increment" can no longer be used
 *  uninitialised.
 ****/
struct matrix_dsc_s *util_comb_matrix(unsigned int m, unsigned int n)
{
    unsigned char buf[BUF_LEN];
    struct matrix_dsc_s *mxp;
    unsigned int num, r, j, k;
    double **pp, *cur, slack;

    num = util_comb_num(m, n);
    sprintf((char *)buf, "combination list C(%u, %u) %u x %u matrix", m, n, num, n);
    mxp = matrix_create(buf, num, n);
    if ( !mxp || num == 0 )
        return mxp;

    /* position j (1-based) may hold at most slack + j, i.e. (m - n) + j */
    slack = (double)m - (double)n;

    pp = mxp->rowp;
    for ( j = 0; j < n; j++ ) {
        pp[0][j] = j + 1;          // first combination: 1, 2, ..., n
    }

    for ( r = 1; r < num; r++ ) {
        /* the previous combination is the basis for the next one */
        cur = pp[r];
        memcpy(cur, pp[r-1], n * sizeof(double));

        /* bump the right-most entry that is still below its maximum and
         * restart the tail as a run of consecutive values */
        for ( j = n; j >= 1; j-- ) {
            if ( cur[j-1] < slack + j ) {
                cur[j-1] += 1;
                for ( k = j; k < n; k++ ) {
                    cur[k] = cur[k-1] + 1;
                }
                break;
            }
        }
    }

    return mxp;
}

/*
 * Return the number of permutations of n items (n!), computed in
 * unsigned int arithmetic; util_perm_num(0) is 1.
 */
unsigned int util_perm_num(unsigned int n)
{
    unsigned int count = 1;

    while ( n > 0 ) {
        count *= n;
        n--;
    }

    return count;
}

/*
 * State for the recursive permutation generator perm():
 *   pp    : table of output rows, one per generated permutation
 *   p     : working vector that is permuted in place
 *   level : index of the next output row to fill
 */
struct perm_s {
    double **pp, *p;
    unsigned int level;
};

#define perm_swap(t, p, i, j) do { t = p[i]; p[i] = p[j]; p[j] = t;} while(0)

/*
 * Generate all permutations of the first n entries of pmp->p by recursive
 * swapping.  Positions [0, i) are fixed; position i takes each remaining
 * element in turn.
 *
 * Each complete permutation is copied into pmp->pp[pmp->level] and
 * pmp->level is advanced.  The caller must provide at least n! rows.
 * Every swap is undone afterwards, so pmp->p holds its original contents
 * when the call returns.
 *
 * memmove is used for the copy because the working vector may itself be
 * one of the output rows (util_perm_matrix uses row 0), in which case the
 * source and destination coincide -- undefined behaviour for memcpy.
 */
void perm(struct perm_s *pmp,  unsigned int n, unsigned int i)
{
    unsigned int j;
    double t;

    if ( i == n ) {
        memmove(pmp->pp[pmp->level++], pmp->p, n * sizeof(double));
        return;
    }

    for ( j = i; j < n; j++ ) {
        perm_swap(t, pmp->p, i, j);
        perm(pmp, n, i+1);
        perm_swap(t, pmp->p, i, j);   // restore before trying the next element
    }
}

/**** Enumerate the permutations of the numbers 1..n.
 *
 *  Creates a util_perm_num(n) x n matrix, fills every row with 1..n and
 *  then lets perm() overwrite the rows with the generated permutations.
 *  Row 0 doubles as perm()'s working vector.  The number of rows written
 *  is logged through DBG.
 ****/
struct matrix_dsc_s *util_perm_matrix(unsigned int n)
{
    unsigned char buf[BUF_LEN];
    struct matrix_dsc_s *mxp;
    struct perm_s pm;
    double **rows, *row;
    int num, r, c;

    num = util_perm_num(n);
    sprintf(buf, "permutation list P(%d) %d x %d matrix", n, num, n);
    mxp = matrix_create(buf, num, n);

    // seed every row with the identity ordering
    rows = mxp->rowp;
    r = 0;
    while ( r < num ) {
        row = rows[r];
        for ( c = 0; c < n; c++ )
            row[c] = c+1;
        r++;
    }

    pm.level = 0;
    pm.p = rows[0];
    pm.pp = rows;

    perm(&pm, n, 0);

    sprintf(dbg, "pm.level = %d", pm.level); DBG(dbg); 

    return mxp;
}


/*
 *  Name:  unsigned int _ilog(unsigned int v)
 *  In:    v,  the value to be measured.
 *  Out:   the number of bits needed to represent v, i.e. how many right
 *         shifts it takes to reach 0.  This is floor(log2(v)) + 1 for
 *         v > 0 and 0 for v == 0 (so _ilog(8) is 4, not 3).
 */
unsigned  int _ilog(unsigned int v)
{
  unsigned int bits;

  for ( bits = 0; v != 0; v >>= 1 ) {
        bits++;
  }

  return bits;
}

/*
 *  Compute power(2, n).
 *
 *  The previous body evaluated "2 << n", which is 2^(n+1) and shifts a
 *  signed int (overflowing for n >= 30).  The shift is now done on an
 *  unsigned 1.  When 2^n does not fit in an unsigned int the result is 0
 *  (the value modulo 2^width) instead of an undefined over-wide shift.
 *
 *  NOTE(review): callers written against the old 2^(n+1) result need to
 *  be re-checked.
 */
unsigned int _ipow(unsigned int n)
{
    if ( n >= 8 * sizeof(unsigned int) )
        return 0;

    return 1u << n;
}




/***==== Generate a random index: one ran1() draw scaled by n and
 *       truncated to an unsigned integer. ==== ***/
unsigned int nau_rand_integer(unsigned int n)
{
    double scaled = ran1() * n;

    return (unsigned int)scaled;
}


/*
 *   Combine four binary digits a b c d (a is the most significant) into
 *   the corresponding value 0..15:  8a + 4b + 2c + d.
 *   The inputs are expected to be 0 or 1; they are not validated.
 */

unsigned int bintohex(unsigned int a, unsigned int b, unsigned int c, unsigned int d)
{
    return (a << 3) + (b << 2) + (c << 1) + d;
}


/** This routine computes the Binomial probability, given
 *  
 *   p:   probability of success
 *   1-p: probability of failure
 *   n :  # of independent trials
 *   k :  # of successes
 *
 * **/

double binomial_prob(unsigned int n, unsigned int k, double p)
{
    unsigned int i, j, m, nkfactorial, nk_permutation;
    double s, f, bp;  // variable bp for binomial probability

    m = n - k;   nk_permutation = 1;
    s = pow(p, k);
    f = pow(1-p, m);
    
    nkfactorial = factorial(m);
    for ( i = 0; i < m; i++) {
        nk_permutation *= (n-i);
    }

    bp = (double)nk_permutation/ (double)nkfactorial * s * f;
    
    return bp;
}


/*
 *  Read all the data matrices whose names are  listed in 
 *  file "mx_tocp" into  struct matrix_op_s {} and return
 *  the pointer to this structure.
 */

struct matrix_op_s *matrix_op_init2(const unsigned char *mx_tocp)
{
    FILE *fp;
    struct matrix_op_s *mop;
    struct matrix_dsc_s *mxp;
    unsigned int i, n;
    char  *mp, *p, *strp, *path, dp[BUF_LEN], buf[BUF_LEN], *s2="\t\r\n ", *sv,
		*fnamep = (char *)mx_tocp;
    size_t sz, len;
    unsigned int num_matrixes = 0;

    sz = nau_file_size(fnamep);
    sz++;

    mp = calloc(sz, sizeof(unsigned char));
	if ( !mp ) {
		DBG("error to allocate memory for mp list ...");
		return NULL;
	}

    mop = calloc(1, sizeof(struct matrix_op_s ) );
	
	if ( !mop ) {
		DBG("error to allocate memory for struct matrix_op_s{} ...");
		return NULL;
	}

	path = NULL;

    fp = fopen(fnamep, "r");
    while ( fgets(buf, sizeof(buf), fp) ) {
        if ( !(buf[0] == '\n' || buf[0] == '#' || buf[0] == '%' || buf[0] == '!' ) ) {
            //sprintf(dbg, " = %s len = %d", buf, strlen(buf)); DBG(dbg); 
			p = strtok_r(buf, s2, &sv);  // This is to truncate all
			if ( nau_file_isdir(p) ) {   // the white space preceeding 
				path = strdup(p);	     // or trailing the file names
				path = realloc(path, strlen(p) + 2 * sz);
			} else {
				num_matrixes++;
				strcat(mp, p);
				strcat(mp, "!");
			}
        }
    }
    
    mop->num_matrixes = num_matrixes;
    //mop->str = 1;
    mop->str = 0;
    mop->ia = 0;
    mop->new_xm = 1;  // default, when solving Ax = b,
                      // create new matrix x for storing the lution
    
    mop->mpp = malloc( mop->num_matrixes * sizeof(struct matrix_dsc_s *) );

    n = num_matrixes;
    strp = mp;

	if ( path ) {
		sprintf(dbg, "Reading data files in directory \"%s\"...", path );
	} else {
		sprintf(dbg, "Reading data files in current directory \"%s\"...",
		 getcwd(buf, sizeof(buf)));
	} 
	

    for ( i = 0; i < n; i++ ) {
        p = strstr(strp, "!");
        *p = '\0';   
        
		if ( path ) {
			sprintf(dp, "%s/%s", path, strp); 
		} else {
			sprintf(dp, "%s", strp);
		}	
        mop->mpp[i] = mxp = mop->str ? matrix_init_str(dp) : nau_binary_file(dp) 
				? matrix_init_bin(NULL, dp): matrix_init2(dp);

		if ( 0 ) {
        	sprintf(dbg, "Init data file %3d: %18s (%5d x %-4d)",
				i, strp, mxp->num_rows, mxp->num_cols); DBG(dbg); 
        }

        strp = p + sizeof(unsigned char) ;
    }

    free(mp);

	if ( path )
		free(path);

    return mop;
}


/*
 * Experimental decomposition of a square matrix into unit-triangular
 * factors and a diagonal (the matrix id string calls it "L*D*L").
 *
 * For m == n the routine builds three temporary matrices (lmxp, l_mxp,
 * dmxp) and a diagonal vector dp, runs the steps below and then releases
 * all of them.  For m != n it only logs a message.
 *
 * NOTE(review): nothing computed here is returned, stored in mxp or
 * printed.  Whatever matrix_mul_ab() produces is not used by this
 * function, so callers currently get no result.
 * NOTE(review): dp is not checked after MALLOC, dp[0] and dp[di] are used
 * as divisors without a zero check, and id[256] can overflow for a long
 * mxp->id.
 */
void  matrix_decompose_cholesky(struct matrix_dsc_s *mxp)
{
    unsigned int i, j, m, n, di;
    unsigned char id[256];
    struct matrix_dsc_s *l_mxp, *lmxp, *dmxp;
    double *rp, **rpp, *lrp, **lrpp, sum2, *dp, a, d;

    m = MXROWS(mxp);
    n = MXCOLS(mxp);

    if ( m == n ) {
    
        sprintf(id, "Cholesky decomposition of matrix %s= L*D*L", mxp->id);
        lmxp = matrix_create(id, m, n);
        dmxp = matrix_create_eye(n);
        dp = MALLOC(2*n, double );
        
        // 1.  Compute the first row and first column of the factor matrix.
        //      Initially, just for programming convenience, the lower and
        //      upper factor data are stored in the same matrix (lmxp).
        
        rpp = MXRPP(mxp);    rp  = rpp[0]; 
        lrpp = MXRPP(lmxp);  lrp = lrpp[0]; 
        dp[0] = lrp[0] = rp[0];
        for ( i = 1; i < n; i++ ) {  // first row
            lrp[i] = rp[i] / dp[0];
        }

        for ( i = 1; i < n; i++ ) {  // first column
            rp  =  rpp[i];
            lrp = lrpp[i];
            lrp[0] = rp[0] / dp[0];
        }
        
        // 2. Construct the factor entries on the [1...n-1][1...n-1] sub-matrix.
        for ( i = 1; i < n; i++ ) {
            rp = rpp[i];  lrp = lrpp[i];
            for ( j = 1; j < n; j++ ) {
                if ( i == j ) {
                    // diagonal: d_i = a_ii - sum_k d_k * l_ik * l_ki
                    dp[i] = rp[i] - matrix_chodecomp_dotprod(lmxp, dp, i);
                    matrix_set_value(lmxp, i, i, dp[i]);
                } else {
                    // off-diagonal: (a_ij - partial sum) / d_min(i,j)
                    d = matrix_chodecomp_offdiagprod(lmxp,dp, i, j);
                    a = rp[j];
                    di = min(i, j);
                    matrix_set_value(lmxp, i, j, (a-d)/dp[di]);
                }
            }
        }

        // 3. Move the upper-triangular part of lmxp to l_mxp; both get a
        //    unit diagonal and lmxp keeps only its lower part.
        l_mxp = matrix_create("U part of Cholesky dompose", n, n);
        rpp = MXRPP(l_mxp);
        for (i = 0; i < n; i++ ) {
            lrp = lrpp[i];
            rp = rpp[i];
            for ( j = i; j < n;j++ ) {
                if ( j == i )
                    rp[j] = lrp[j] = 1;
                else {
                    rp[j] = lrp[j];
                    lrp[j] = 0;
                }
            } 
        }

        // 4. Put the diagonal vector dp on the diagonal of dmxp.
        matrix_eye2diag(dmxp, dp);
        
        matrix_mul_ab(lmxp, dmxp);


        free(dp);
        matrix_dsc_fini(lmxp);
        matrix_dsc_fini(l_mxp);
        matrix_dsc_fini(dmxp);
    } else {
        sprintf(dbg, "Wrong matrix params: m must equal to n: now m= %d, n = %d", m, n); DBG(dbg); 
    }
}

/*
 * Weighted dot product used for diagonal entry dn:
 *
 *     sum_{k < dn}  dp[k] * L(dn, k) * L(k, dn)
 *
 * where L(r, c) is read from lmxp with matrix_get_value().
 */
double matrix_chodecomp_dotprod(struct matrix_dsc_s *lmxp, double *dp, unsigned int dn)
{
    double acc = 0.0;
    unsigned int k = 0;

    while ( k < dn ) {
        double row_val = matrix_get_value(lmxp, dn, k);
        double col_val = matrix_get_value(lmxp, k, dn);

        acc += dp[k] * row_val * col_val;
        k++;
    }

    return acc;
}

/*
 * Weighted dot product used for off-diagonal entry (rn, cn):
 *
 *     sum_{k < min(rn, cn)}  L(rn, k) * L(k, cn) * dp[k]
 *
 * where L(r, c) is read from lmxp with matrix_get_value().
 */
double matrix_chodecomp_offdiagprod(struct matrix_dsc_s *lmxp, double *dp, unsigned int rn, unsigned int cn)
{
    const unsigned int terms = min(rn, cn);
    double acc = 0.0;
    unsigned int k = 0;

    while ( k < terms ) {
        double row_val = matrix_get_value(lmxp, rn, k);
        double col_val = matrix_get_value(lmxp, k, cn);

        acc += row_val * col_val * dp[k] ;
        k++;
    }

    return acc;
}

/*
 *   Set the diagonal entries of the square matrix mxp from the elements
 *   of vector dp:
 *
 *         a_ii  = dp[i]      for i = 0 .. n-1.
 *
 *   Off-diagonal entries are left untouched.  The caller is responsible
 *   for dp holding at least n elements.  A non-square matrix is left
 *   unchanged and reported through DBG.
 */
void  matrix_eye2diag(struct matrix_dsc_s *mxp, double *dp)
{
    const unsigned int rows = MXROWS(mxp);
    const unsigned int cols = MXCOLS(mxp);
    double **table;
    unsigned int k;

    if ( cols != rows ) {
        sprintf(dbg, "m must equal n, now m = %d, n = %d", rows, cols); DBG(dbg); 
        return;
    }

    table = MXRPP(mxp);
    for ( k = 0; k < cols; k++ ) {
        table[k][k] = dp[k];
    }
}

/** Cholesky decomposition A = R * R.T with R lower triangular.
 *
 *  Prints the input matrix, computes R row by row
 *
 *      R[i][j] = ( A[i][j] - sum_{k<j} R[i][k] * R[j][k] ) / R[j][j]   (j < i)
 *      R[i][i] = sqrt( A[i][i] - sum_{k<i} R[i][k]^2 )
 *
 *  and prints R.  The factor is a temporary and is released before
 *  returning.  Only the lower triangle of A is read.
 *
 *  The routine logs a message and gives up when the matrix is not square
 *  or a pivot is not strictly positive (A is not positive definite).
 *
 *  Fixes over the previous draft: the hard-coded R[1][1] = 3 and the
 *  incomplete inner recurrence are replaced by the standard algorithm, a
 *  zero pivot is rejected (it was used as a divisor), and the temporary
 *  matrix is no longer leaked on the early exit.
 **/
void  matrix_decompose_cholesky2(struct matrix_dsc_s *mxp)
{
	unsigned int i, j, k, m, n;
	struct matrix_dsc_s *lmxp;
	double **rpp, **lrpp, *lrp, sum;
	
	m = MXROWS(mxp);   rpp = MXRPP(mxp);
	n = MXCOLS(mxp);

	matrix_print(mxp);

	if ( m != n ) {
		sprintf(dbg, "Wrong matrix params: m must equal to n: now m= %d, n = %d", m, n); DBG(dbg);
		return;
	}

	lmxp = matrix_create("Cholesky decomposition", m, n);
	lrpp = MXRPP(lmxp);

	for ( i = 0; i < m; i++ ) {
		lrp = lrpp[i];
		for ( j = 0; j <= i; j++ ) {
			sum = rpp[i][j];
			for ( k = 0; k < j; k++ ) {
				sum -= lrp[k] * lrpp[j][k];
			}
			if ( j < i ) {
				lrp[j] = sum / lrpp[j][j];
			} else {
				if ( sum <= 0 ) {
					sprintf(dbg, "No decomposition: pivot %u (=%g) <= 0", i, sum); DBG(dbg);
					matrix_dsc_fini(lmxp);
					return;
				}
				lrp[i] = sqrt(sum);
			}
		}
		for ( j = i + 1; j < n; j++ ) {
			lrp[j] = 0.0;      // strictly upper part of R is zero
		}
	}

	matrix_print(lmxp);
	matrix_dsc_fini(lmxp);
}


/** Check if a matrix is strictly diagonally dominant, i.e. for every row i
 *
 *        |a_ii|  >  sum_{j != i} |a_ij|.
 *
 *  If yes, we know it is nonsingular for sure.
 *  Returns 1 when the condition holds for all rows, 0 otherwise.
 *
 *  Fixes over the previous version: fabs() replaces the integer abs(),
 *  which truncated the entries; the sums are evaluated per row instead of
 *  accumulating over all rows; and equality no longer counts as dominant.
 **/
unsigned int matrix_is_sdiagdom(struct matrix_dsc_s *mxp)
{
    unsigned int i, j, m, n;
    double **rpp, *rp, diag, offdiagsum;
    
    m = MXROWS(mxp);
    n = MXCOLS(mxp);
    rpp = MXRPP(mxp);
    for ( i = 0; i < m; i++ ) {
        rp = rpp[i];
        diag = 0.0;          // stays 0 for rows without a diagonal entry
        offdiagsum = 0.0;
        for ( j = 0; j < n; j++ ) {
            if ( i == j ) {
                diag = fabs(rp[j]);
            } else {
                offdiagsum += fabs(rp[j]);
            }
        }
        if ( diag <= offdiagsum ) {
            return 0;
        }
    }

    return 1;
}


/* 
 *   Matrix Crout decomposition
 */

void  matrix_decompose_crout(struct matrix_dsc_s *mxp)
{
    unsigned int i, j, m, n, di;
    unsigned char id[256];
    struct matrix_dsc_s *l_mxp, *cmxp, *dmxp;
    double *rp, **rpp, *crp, **crpp, sum2, *dp, a, v, l, u;

    m = MXROWS(mxp);  rpp = MXRPP(mxp);
    n = MXCOLS(mxp);

    if ( m == n ) {
        cmxp = matrix_create("Crout decomp L", m, n);
        crpp = MXRPP(cmxp);
        // 1. Compute the first row.
        rp = rpp[0];       crp = crpp[0];
        crp[0] = rp[0];    u = crp[1] = rp[1]/rp[0];

        // 2. Compute the rest rows.
        for ( i = 1; i < m; i++ ) {
            rp = rpp[i];    a = rp[i];
            crp = crpp[i];  l = crp[i-1] = rp[i-1];
            crp[i] = (a - l * u);
            crp[i+1] = u = rp[i+1]/crp[i];
        }
    } else {
    
    }
    
    //matrix_print(cmxp);
    matrix_dsc_fini(cmxp);
}



/** =============== Nonlinear Programming ================ **/
/* Selectors for the formula used to compute the conjugate-gradient beta
 * (stored in nlnopt_s.alg). */
#define NLNOPT_DCALG_HS   1  // 1952 Hestenes and Stiefel
#define NLNOPT_DCALG_FR   2  // 1964 Fletcher and Reeves
#define NLNOPT_DCALG_D    3  // 1967 Daniel 
#define NLNOPT_DCALG_PRP  4  // 1969 Polak and Ribiere and Polyak
#define NLNOPT_DCALG_CD   5  // 1984 Conjugate Descent, Fletcher
#define NLNOPT_DCALG_LS   6  // 1987 Liu and Storey
#define NLNOPT_DCALG_DY   7  // 1999 Dai and Yuan
#define NLNOPT_DCALG_HZ   8  // 2005 Hager and Zhang
#define NLNOPT_DCALG_MRM  9  // 2015 Mohamed, Rivaie and Mustafa

/* Selectors for the interpolation used to pick the next step length
 * (stored in nlnopt_s.ip_type). */
#define NLNOPT_IP_QUAD1   1  // quadratic interpolation: two points + one gradient
#define NLNOPT_IP_QUAD2   2  // quadratic interpolation: one point + two gradients
#define NLNOPT_IP_CUBIC   3  // cubic interpolation

/*
 * State of one nonlinear-optimisation run.
 *
 * The double* members point at vectors of num_dims entries; nlnopt_init()
 * assigns xkp, gkp, xk1p, gk1p, dkp, ykp, xk0p and dk0p to rows of a
 * matrix owned by mop.
 * NOTE(review): dk1p and fxp are not assigned in nlnopt_init().
 */
struct nlnopt_s {
	unsigned int num_points, num_dims, i0, h;
	/* line-search parameters, step lengths (ak, ak1), function values
	 * (fk, fk1), directional slopes (dfk, dfk1) and work vectors */
	double rho, sigma, delta, tao, tao1, tao2, tao3, INT, EXT, 
		   alpha, beta, gamma,  f0, x0, h0, ak, ak1, fk, fk1, 
		   dfk, dfk1, *gk1p, *gkp, *dk0p, *xk0p,
		   *dkp, *dk1p, *xk1p, *xkp, *ykp, *fxp;  // function value
	struct matrix_op_s *mop;   // matrix set; released by nlnopt_fini()

	/**== for quadratic or cubic interpolation ==**/
	unsigned int qip1: 1,  // quadratic interpolation 1: a1, a2, f1, f2, df2 are known
	             qip2: 1,  // quadratic interpolation 2: a1, a2, f1 or f2, df1 and df2 are known 
				 cip : 1,  // cubic interpolation:  a1, a2, f1, f2, df1 and df2 are known
				 gk1 : 1,  // select gk1
				 alg : 6,  // select algorithm to compute beta (NLNOPT_DCALG_*)
			  ip_type: 3,  // select algorithm to interpolate next alpha (NLNOPT_IP_*)
				 ax1 : 1;
	double ip_x[4], ip_y[4], ip_df[4];  // up to 4 points
};

/* Forward declarations for the nonlinear-optimisation routines:
 * lifecycle (init/fini), the iteration drivers (nlnopt_ops*), their
 * per-step helpers, and the function/interpolation evaluators. */
struct nlnopt_s *nlnopt_init(struct na_params_s *p);
void  nlnopt_ops(struct nlnopt_s *nlp);
void  nlnopt_ops2(struct nlnopt_s *nlp);
void  nlnopt_ops3(struct nlnopt_s *nlp);
void  nlnopt_ops4(struct nlnopt_s *nlp);
void  nlnopt_fini(struct nlnopt_s *nlp);
void  nlnopt_ops_xk(struct nlnopt_s *nlp);
void  nlnopt_ops_dk(struct nlnopt_s *nlp);
void  nlnopt_ops_yk(struct nlnopt_s *nlp);
void  nlnopt_ops_alpha(struct nlnopt_s *nlp);
void  nlnopt_ops_beta(struct nlnopt_s *nlp);
void  nlnopt_ops_fxgdx(struct nlnopt_s *nlp);
unsigned int nlnopt_ops_moveok(struct nlnopt_s *nlp);
void  nlnopt_ops_next(struct nlnopt_s *nlp);
void  nlnopt_ops_backforward(struct nlnopt_s  *nlp);
void  nlnopt_ops_backforward_int(struct nlnopt_s  *nlp);
unsigned int nlnopt_num_dims(struct nlnopt_s *nlp);

double nlnopt_nextiter(struct nlnopt_s *nlp);
double nlnopt_func(unsigned int n, double *xp, double *dfp);
double nlnopt_fxdfx(struct nlnopt_s *nlp, double alpha, double *df);
double nlnopt_quad_itpl(double fa, double fb, double dfa, double dfb);
double nlnopt_quad_interpolation(struct nlnopt_s *nlp);
double nlnopt_cubic_ext(double z1, double z3, double f2, double f3, double s2, double s3, double limit);
double nlnopt_fxdfx1(struct nlnopt_s *nlp, double alpha, double *df);
double nlnopt_dk0(struct nlnopt_s *nlp);

double  nlnopt_new_x0(struct nlnopt_s *nlp, double *df);

/*
 * Driver for the nonlinear optimiser: set up the state, run the
 * nlnopt_ops4() variant, release the state.
 */
void run_nlnopt(struct na_params_s *p)
{
	struct nlnopt_s *nlp = nlnopt_init(p);

	nlnopt_ops4(nlp);
	nlnopt_fini(nlp);
}

struct nlnopt_s *nlnopt_init(struct na_params_s *p)
{
	struct nlnopt_s *nlp;
	unsigned int i, n;
	//char *nlnp = "nln";
	char *nlnp = "ml";
	struct matrix_op_s *mop;
	struct matrix_dsc_s *mxp, *vmxp;
	double **vrpp, *vrp;
	
	nlp = CALLOC(1, struct nlnopt_s);
	nlp->mop = mop = matrix_op_init2(nlnp);
	mxp = MOP2MXP(mop, 0);
	nlp->num_points = n = 2;   
	nlp->num_dims = n;        

	nlp->tao  = 2;      nlp->rho  = 0.01;    nlp->sigma = 0.1;
	nlp->tao1 = 9.0;    nlp->tao2 = 0.1;     nlp->tao3  = 0.9;
	nlp->EXT  = 3.0;    nlp->INT  = 0.1;     nlp->h     = 1;
	nlp->h0   = 0.1;    nlp->x0   = -2;      nlp->f0    = 4;

	vmxp = matrix_create2(mop, "nln opt vectors", 10, n);
	vrpp = MXRPP(vmxp);  i = 0;
 
	nlp->xkp  = vrpp[i++];         nlp->gkp  = vrpp[i++];
	nlp->xk1p = vrpp[i++];         nlp->gk1p = vrpp[i++];
	nlp->dkp  = vrpp[i++];         nlp->ykp  = vrpp[i++];         
	nlp->xk0p = vrpp[i++];         nlp->dk0p = vrpp[i++];

	return nlp;
}

/* Release the optimiser state: first the matrix set it refers to, then
 * the state structure itself. */
void nlnopt_fini(struct nlnopt_s *nlp)
{
	struct matrix_op_s *mop = nlp->mop;

	matrix_op_fini(mop);
	free(nlp);
}

/*
 * Dump the current 2-D optimiser state through DBG, four lines:
 *   x -> x1          first two entries of xkp and xk1p
 *   g, (ak,ak1,beta) first two entries of gk1p plus step lengths and beta
 *   f, d             function values fk, fk1 and first two entries of dkp
 *   slope            directional derivatives dfk, dfk1
 */
void nlnopt_show_x(struct nlnopt_s *nlp)
{
	const double *x      = nlp->xkp;
	const double *x_next = nlp->xk1p;
	const double *g      = nlp->gk1p;
	const double *d      = nlp->dkp;

	sprintf(dbg, "x(%g, %g) --> x1(%g, %g)", x[0], x[1], x_next[0], x_next[1]); DBG(dbg); 
	sprintf(dbg, "g(%g, %g) (ak, ak1, beta)=(%g, %g, %g)",
		g[0], g[1], nlp->ak, nlp->ak1, nlp->beta); DBG(dbg); 
	sprintf(dbg, "f(%g, %g) d(%g, %g)", nlp->fk, nlp->fk1, d[0], d[1]); DBG(dbg); 
	sprintf(dbg, "slope : df(%g, %g)", nlp->dfk, nlp->dfk1); DBG(dbg); 
}

/*
 * Optimisation driver, variant 1.
 *
 * Starts from x = (-1.2, 1), evaluates the function and gradient, takes
 * the negative gradient as first search direction and then runs nl = 10
 * outer iterations.  Each outer iteration tries up to ml = 50 step lengths
 * (nlnopt_ops_xk / nlnopt_ops_fxgdx / nlnopt_ops_moveok, with
 * nlnopt_ops_alpha proposing a new alpha after a rejected step) and then
 * updates beta, the direction and the iterate.
 *
 * Progress is logged through DBG; nothing is returned.
 *
 * NOTE(review): mxp, rpp and rp are fetched from the matrix set but not
 * used afterwards; x1, x2 are unused as well.
 */
void nlnopt_ops(struct nlnopt_s *nlp)
{
	unsigned int i, j, nl, ml, n, num_points;
	struct matrix_op_s *mop = MOP(nlp);
	struct matrix_dsc_s *mxp;
	double **rpp, *rp, x1, x2, *xk1p, *xkp, *gkp, *gk1p, *dkp;
	
	n = num_points = nlp->num_points;
	mxp = MOP2MXP(mop, 0);
	
	rpp = MXRPP(mxp);
	rp = rpp[0];
	

	/** 1. Init vector x **/
	xkp  = nlp->xkp;     gkp  = nlp->gkp;    dkp = nlp->dkp;
	xk1p = nlp->xk1p;    gk1p = nlp->gk1p;   
	i = 0;
	xkp[i] = xk1p[i] = -1.2;  i++;
	xkp[i] = xk1p[i] =  1;
	//xkp[i] = xk1p[i] = 2.2;  i++;
	//xkp[i] = xk1p[i] =  1;
	
	nlnopt_ops_fxgdx(nlp);  // nlnopt_ops_fxgdx() computes the  
	 nlp->fk = nlp->fk1;    // function value and its partial derivatives

	/** initial search direction d0 = -g0  **/
	for ( i = 0; i < n; i++ ) {
		gkp[i] =  gk1p[i];
		dkp[i] = -gk1p[i];
	}
	/* directional derivative along d0: <g0, d0> */
	nlp->dfk = nlp->dfk1 = vectors_inner_prod(n, gkp, dkp);

	sprintf(dbg, "n = %d  dfk1 = %g", n, nlp->dfk1 ); DBG(dbg); 
	/** init alpha0 **/
	nlp->ak = 0;
	nlp->ak1 = 1;
	nlnopt_show_x(nlp);
	
    DBG("=========  Init finished, now iterating ... ========"); 
	
	nlp->ip_type = NLNOPT_IP_QUAD1;
	//nlp->alg = NLNOPT_DCALG_DY;
	nlp->alg = NLNOPT_DCALG_HZ;
	//nlp->alg = NLNOPT_DCALG_PRP;
	nl = 10;   // outer iterations
	ml = 50;   // max. step-length trials per outer iteration
	/** 2.  iterate to approximate the solution  **/
	for ( i = 0; i < nl; i++) {
		DBG("-------------------");
		nlnopt_show_x(nlp);
		DBG("*******************");
		
		for ( j = 0; j < ml; j++) {
			nlnopt_ops_xk(nlp);    // compute x_(k+1)
			nlnopt_ops_fxgdx(nlp); // compute f(x_(k+1)) and f'(x_(k+1))
			nlnopt_show_x(nlp);

			if ( nlnopt_ops_moveok(nlp) ) {
				DBG("OK!"); 
				break;
			} else {
				// compute a new alpha
				nlnopt_ops_alpha(nlp);
			}
		}
		
		/* NOTE(review): reached as well when all ml trials were rejected */
		nlnopt_ops_beta(nlp);
		nlnopt_ops_dk(nlp);
		nlnopt_ops_next(nlp);
	}
	
}

//------------------------------------------------------------

//double nlnopt_ops4_init(struct nlnopt_s *nlp, double alpha)
/*
 * Set the starting point X0 = [-1; 3.5] in nlp->xkp.
 *
 * Returns 0.0.  The function is declared to return a double but had no
 * return statement, so using its value was undefined behaviour.  The
 * evaluation of f(X0), which was presumably meant to be returned, is still
 * disabled, so a fixed placeholder is returned for now.
 */
double nlnopt_ops4_init(struct nlnopt_s *nlp)
{
	double *xkp = nlp->xkp;

	xkp[0] = -1.0;     // Init X0 to be [-1; 3.5];
	xkp[1] =  3.5; 

	// TODO: evaluate the function at X0 (e.g. via nlnopt_func()) and
	// return that value once the initialisation path is enabled again.
	return 0.0;
}

/*
 * Optimisation driver, variant 4: iterations with a two-phase line search.
 *
 * After obtaining f0/df0 from nlnopt_new_x0(), the routine performs k = 6
 * outer iterations.  Each one
 *   1. brackets a step length alpha (at most 6 trials, extending the
 *      interval towards the upper limit mu),
 *   2. sections the bracket [a, b] (at most m = 20 trials) using
 *      nlnopt_quad_itpl(), falling back to bisection when the proposal is
 *      too close to either end, until |df| <= -sigma * df0.
 * On success nlnopt_nextiter() computes the new direction; its return
 * value becomes the new slope df0 and mu is rescaled.  On failure the
 * search is retried with alpha = mu and a doubled mu.
 *
 * Progress is logged through DBG; nothing is returned.
 *
 * NOTE(review): fb and dfb have no value before the bracketing loop first
 * assigns them, yet "fa = fb; dfa = dfb;" reads them there.  They also stay
 * unset if the bracketing loop finishes without a break, and are then
 * logged and passed to nlnopt_quad_itpl().
 * NOTE(review): nlnopt_quad_itpl() is called with (a, b, dfa, dfb) while
 * its prototype names the first two parameters fa, fb -- confirm which is
 * intended.
 * NOTE(review): the bracketing test uses f0 + alpha * df0 (no rho factor)
 * whereas the sectioning test uses f0 + rho * alpha * df0.
 */
void nlnopt_ops4(struct nlnopt_s *nlp)
{
	unsigned int i, j, k, l, nl, ml, n, m, num_dims, ok;
	struct matrix_op_s *mop = MOP(nlp);
	struct matrix_dsc_s *mxp;
	double **rpp, *rp, x1, x2, *xk1p, *xkp, *gkp, *gk1p, *dkp, 
		alpha, alpha_prev, f0, df0, fmu, dfmu, 
		fa, dfa, fb, dfb, f, f_prev, df, sigma, rho,
		a, b, tao1, tao2, tao3, fk, fk1, mu, len_ba, gap, t;
	
	// not needed: nlp->xkp should already have been set upon entry of this routine
	// nlnopt_ops4_init(nlp);

	k = 3;    m = 20;
	num_dims = n = nlp->num_dims;
	tao1 = nlp->tao1;      xkp = nlp->xkp;     xk1p = nlp->xk1p;
	tao2 = nlp->tao2;      dkp = nlp->dkp;    // dk1p = nlp->dk1p;
	tao3 = nlp->tao3;      gkp = nlp->gkp;
	
	// 1. Compute the function value at xkp and its partial 
	// derivatives, stored in dkp.   xkp is initialized, func val is f0, computed
	// gradient vector is in dkp.
	//
	
	alpha = 100.0;   rho = nlp->rho;  sigma = nlp->sigma;

	sprintf(dbg, " alpha = %g  rho = %g  sigma = %g", alpha, rho, sigma); DBG(dbg); 
	
	f0 = nlnopt_new_x0(nlp, &df0);
	if ( 0 ) { // Verify if the function computes the initial 
	           // values and partial derivatives correctly.
		vector_sprint("x0 ==> ", n, xkp);
		vector_sprint("d0 ==> ", n, dkp);
		sprintf(dbg,  "f0 = %g", f0); DBG(dbg); 
	}

	fmu = nlnopt_fxdfx1(nlp, alpha, &dfmu); // Compute for X1

	//f0 = nlnopt_fxdfx1(nlp, 0, &df0);     // Compute f(0) and f'(0) at X
	mu = (fmu - f0) / (rho * df0);
	sprintf(dbg, "f0 = %g  df0 = %g  fmu = %g dfmu = %g, mu = %g", 
		                      f0, df0, fmu, dfmu, mu); DBG(dbg); 

	// the value computed above is only logged; the search starts with mu = 100
	mu = 100;
	a = 0, b = mu;
	alpha = b;
	alpha_prev = 0;
	f_prev = f0;
	
	k = 6;  m = 20;
	sprintf(dbg, "k = %d  n = %d m = %d", k, n, m); DBG(dbg); 

	for ( l = 0; l < k; l++ ) {  // # of iterations
		
		ok = 0;
		// --------------- 1. bracketing -----------------
		for ( i = 0; i < 6; i++ ) {
			// compute f and df at this X and alpha.
			f = nlnopt_fxdfx1(nlp, alpha, &df);  
			
			// f rose above the reference line or above the previous trial:
			// a minimum is bracketed by [alpha_prev, alpha]
			if ( f > (f0 + alpha * df0) || f > f_prev ) { 
				a = alpha_prev;  fa = fb; dfa = dfb;
				b = alpha;       fb = f;  dfb = df;
				break;
			}
			
			// otherwise extend the trial step, capped at mu
			gap = 2.0 * alpha - alpha_prev;
			t = alpha - alpha_prev;
			if ( mu <= gap ) {
				alpha_prev = alpha;
				alpha = mu;
			} else {
				alpha_prev = alpha;
				f_prev = f;
				alpha = (gap + min(mu, alpha + tao1 *t) )/2.0;
			}
		}
		
		sprintf(dbg, "f0, df0 = %g, %g;  a, b = %g, %g", f0, df0, a, b); DBG(dbg); 
		// the left end is always re-initialised from the start point here
		fa = f0;   dfa = df0;
		sprintf(dbg, "fa, dfa = %g, %g;  fb, dfb = %g, %g", 
			fa, dfa, fb, dfb); DBG(dbg); 
		
		// --------------- 2. sectioning -------------------
		for ( j = 0; j < m; j++ ) { // max # of sectioning
			
			alpha = nlnopt_quad_itpl(a, b, dfa, dfb);
			len_ba = b - a;
		
			// reject proposals outside (a + tao2*len, b - tao3*len): bisect instead
			if ( ! ( ( a + tao2 * len_ba ) < alpha && ( alpha < (b - tao3 * len_ba) ) ) ) {
		 	
				alpha = (a + b)/2.0;	
			}

			f =  nlnopt_fxdfx1(nlp, alpha, &df);
			
			sprintf(dbg, "f, df = %g, %g", f, df); DBG(dbg); 

			if ( f > ( f0 + rho * alpha * df0) || f > fa ) {
				b = alpha;   // a stays the same.
				fb = f;      // for b, save the function value f
				dfb = df;    // as fb and slope df as dfb.
			} else {
				if ( fabs(df) <= (-sigma * df0 ) ) {
					ok = 1;    // success
					//f0 = f;    // save the function value and 
					//df0 = df;  // slope as the new start point.
				} else {
					// NOTE(review): same condition as the enclosing "if", so
					// this branch cannot be taken; only the else part runs.
					if ( fabs(df) <= (- sigma * df0 )  ) {
						ok = 1;   // success
						//f0 = f;   // save the function value and
						//df0 = df; // slope as the new start point.
					} else {
						
						if ( len_ba * df >= 0 ) { // ... then, update b
							b = a;
							fb = fa;
							dfb = dfa;
						}	
						// update a
						a = alpha;
						fa = f;
						dfa = df;
					}	
				}
			}

			if ( ok ) {  // --- success ---
				sprintf(dbg, "-- ok -- f, df = %g, %g", f, df); DBG(dbg); 
				// new search direction; the returned value is the new slope
				df = nlnopt_nextiter(nlp);	
				if ( 1 ) {
					mu = alpha * min(100, fabs(df/dfa) );
				} else {
					mu = 100;  // <--- need modification ---
				}
				f0 = fa = f;
				df0 = dfa = df;
				break;
			}
		
		} //<------ end of sectioning phase.
		
		if ( !ok ) {
			// line search fail, try a new alpha ...
			DBG("Line search failed ...");

			// f0 and df0 stay the same.
			fa = f0,    dfa = df0;
			f_prev = f0;
			alpha = mu;
			mu *= 2;
			fb = nlnopt_fxdfx1(nlp, alpha, &dfb);
		}

	} // <---------- end of iterations------------

}

/**------ Compute beta, derive the new search direction in dkp and save
 * the current point and gradient as the reference for the next step.
 *
 * gkp holds the most recently computed gradient, gk1p the saved one.
 * With y = gkp - gk1p the three candidate betas are evaluated
 * (Hestenes-Stiefel, Polak-Ribiere-Polyak, Dai-Yuan); the
 * Polak-Ribiere-Polyak value is the one applied:
 *
 *        d  <-  beta * d - g
 *
 * If the new direction has a positive inner product with the gradient,
 * dkp is reset to -gkp, gkp itself is negated and the value -<g, g>
 * replaces that inner product.
 *
 * Finally xkp is copied into xk1p and gkp into gk1p.
 *
 * Returns the inner product <d, g> (or -<g, g> after a reset).
 * -----**/
double nlnopt_nextiter(struct nlnopt_s *nlp)
{
	const unsigned int dims = nlp->num_dims;
	double *y       = nlp->ykp;
	double *dir     = nlp->dkp;
	double *x       = nlp->xkp;
	double *x_saved = nlp->xk1p;
	double *g       = nlp->gkp;
	double *g_saved = nlp->gk1p;
	double gy, gg_new, gg_old, dy;
	double beta_hs, beta_pr, beta_dy, beta, slope;
	unsigned int k;

	// y = latest gradient - saved gradient
	k = 0;
	while ( k < dims ) {
		y[k] = g[k] - g_saved[k];
		k++;
	}

	gy     = vectors_inner_prod(dims, g,       y);
	gg_new = vectors_inner_prod(dims, g,       g);
	gg_old = vectors_inner_prod(dims, g_saved, g_saved);
	dy     = vectors_inner_prod(dims, dir,     y);

	beta_hs = gy     / dy;      // Hestenes and Stiefel
	beta_pr = gy     / gg_old;  // Polak-Ribiere-Polyak
	beta_dy = gg_new / dy;      // Dai-Yuan
	(void)beta_hs;
	(void)beta_dy;

	beta = beta_pr;             // the variant currently selected

	sprintf(dbg, "beta = %g", beta); DBG(dbg); 

	// new search direction
	for ( k = 0; k < dims; k++ ) {
		dir[k] = beta * dir[k] - g[k];
	}

	slope = vectors_inner_prod(dims, dir, g);
	sprintf(dbg, " s = %g", slope); DBG(dbg); 
	if ( slope > 0 ) {
		slope = 0;
		for ( k = 0; k < dims; k++ ) {
			slope += g[k] * g[k];
			dir[k] = -g[k];
			g[k]   = -g[k];
		}
		slope = -slope;
	}

	// remember the current point and gradient as the new reference
	for ( k = 0; k < dims; k++ ) {
		x_saved[k] = x[k];
		g_saved[k] = g[k];
	}

	return slope;
}

/* 
 *
 *  Evaluate the function value and compute its partial derivatives.
 *
 */
double nlnopt_func(unsigned int n, double *xp, double *dfp)
{
	unsigned int i;
	double f, x1, x2, x3, x4;

	i = 0; 
	x1 = xp[i++];
	x2 = xp[i++];

	i = 0;
	if ( 0 ) {
		// ------------------- rosenbrock function ----------------
		//  a local minimizer x* = [1; 1]
		
		// --- 1. function value ------
		f  = 100.0 * pow(x2 -x1*x1, 2.0) + pow(1.0-x1, 2.0);

		// --- 2. comput the gradient vector, the partial derivatives  ----
		dfp[i++] = 200.0 * (x2 - x1*x1) * (-2.0 * x1) - 2.0 *(1.0 - x1);
		dfp[i++] = 200.0 * (x2 - x1*x1);

	} else if ( 0 ) {
		// --------- PMO, Prof R. Fletcher 2nd ed. Prob 2.2 ---------
		//  a local minimizer x* = [0.6959; -1.3479]
		//
		/** 1. compute the function value **/
		f = pow(x1, 4.0) + x1 * x2 +  pow(1.0 + x2, 2.0);

		/** 2. comput the gradient vector, i.e. the partial derivatives */
		dfp[i++] = 4.0 * pow(x1, 3.0) + x2;
		dfp[i++] = x1 + 2.0 *(1.0 + x2);
	
	} else if ( 1 ) {
		// --------- NA,  Burden 7th ed. ----------------------------
		//  a local minimizer x* = [-1.0472; 3.6652]
		//
		/** 1. compute the function value **/
		f = cos(x1 + x2) + sin(x1) +  cos(x2);

		/** 2. comput the gradient vector, i.e. the partial derivatives */
		dfp[i++] = -sin(x1 + x2) + cos(x1);
		dfp[i++] = -sin(x1 + x2) - sin(x2);
	}

	return f;
}

/** ===== Evaluate the objective and its slope at X0 + alpha * d ===== **/
//
//   Here xk1p holds the base point X0 and xkp receives the new X:
//       xkp = xk1p + alpha * dkp
//   The gradient at the new X is written to gkp, the function value to
//   nlp->fk and the directional derivative (gkp . dkp) to nlp->dfk and *df.
//   Returns the function value.
//
double nlnopt_fxdfx(struct nlnopt_s *nlp, double alpha, double *df)
{
	const unsigned int dims = nlp->num_dims;
	double *trial = nlp->xkp;
	double *base  = nlp->xk1p;
	double *grad  = nlp->gkp;
	double *dir   = nlp->dkp;
	double slope;
	unsigned int k;

	// step from the base point along the search direction
	for ( k = 0; k < dims; k++ )
		trial[k] = base[k] + alpha * dir[k];

	// function value and gradient at the trial point
	nlp->fk = nlnopt_func(dims, trial, grad);

	// slope along the search direction
	slope = vectors_inner_prod(dims, grad, dir);
	*df = slope;
	nlp->dfk = *df;

	return nlp->fk;
}


/*
 * Secant-style interpolation: returns fa - s * dfa with
 * s = (fa - fb) / (dfa - dfb).  No guard is applied when dfa == dfb.
 */
double nlnopt_quad_itpl(double fa, double fb, double dfa, double dfb)
{
	const double ratio = (fa - fb)/(dfa - dfb);

	return fa - ratio * dfa;
}

/*
 * Secant step on the slopes stored in nlp:
 *     alpha = ak1 - dfk1 * (ak - ak1) / (dfk - dfk1)
 * The result is stored in nlp->alpha and returned.
 */
double nlnopt_quad_interpolation(struct nlnopt_s *nlp)
{
	const double step_a  = nlp->ak,  step_b  = nlp->ak1;
	const double slope_a = nlp->dfk, slope_b = nlp->dfk1;
	const double ratio   = (step_a - step_b) / (slope_a - slope_b);

	nlp->alpha = step_b - ratio * slope_b;

	return nlp->alpha;
}

/*
 * Quadratic fit step:  z2 = z3 - 0.5 * s3 * z3 / (s3 + (f2 - f3) / z3).
 * No guard is applied for z3 == 0 or a zero denominator.
 */
double nlnopt_quad_fit(double z3, double f2, double f3, double s3)
{
	const double denom = s3 + (f2 - f3)/z3;

	return z3 - 0.5 * s3 * z3 / denom;
}

double nlnopt_cubic_fit(double z3, double f2, double f3, double s2, double s3)
{
	double t, A, B, z2;
	
	A = 6 * (f2 - f3)/z3 + 3 * (s2 + s3);
	B = 3 * (f3 - f2) - z3 * (s3 + 2 * s2);

	t = B * B - A * s2 * z3 * z3;
	if ( t < 0 ) {
		z2 = z3/2.0;
	} else {
		z2 = ( sqrt(t) - B) /A;
	}

	return z2;
}


double nlnopt_cubic_ext(double z1, double z3, double f2, double f3, double s2, double s3, double limit)
{
	double t, A, B, z2,  INT = 0.1, EXT = 3.0;
	
	A = 6 * (f2 - f3)/z3 + 3 * (s2 + s3);
	B = 3 * (f3 - f2) - z3 * (s3 + 2 * s2);

	t = B * B - A * s2 * z3 * z3;

	if ( t < 0 ) { // no real root for t
		if (limit < -0.5 ){
			z2 = z1 * (EXT - 1);
		} else { 
			z2 = (limit-z1)/2;
		}
	} else {
		z2 = -s2 *z3*z3/ ( B + sqrt(t) );

		if ( isnan(z2) || isinf(z2) || z2 < 0 ) {  
			if ( limit < -0.5 ) {
				z2 = z1 * (EXT - 1);                 
			} else { 
				z2 = (limit-z1)/2;        
			}
		} else if ( (limit > -0.5) && (z2+z1 > limit) ) {
			z2 = (limit-z1)/2;                             
			
		} else if ( (limit < -0.5) && (z2+z1 > z1*EXT) ) {
			z2 = z1*(EXT-1.0);                       
			
		} else if ( z2 < -z3*INT ) {
			z2 = -z3*INT;
		} else if ( (limit > -0.5) && (z2 < (limit-z1)*(1.0-INT)) ) { 
			z2 = (limit-z1)*(1.0-INT);
		}
	}
		
	return z2;
}


/*
 * Pick an interpolated step: quadratic fit when f2 > f1, cubic fit
 * otherwise.  A NaN or infinite result is replaced by the bisection z3 / 2.
 */
double nlnopt_alpha(double f1, double f2, double f3, double s2, double s3, double z3)
{
	const double step = ( f2 > f1 )
		? nlnopt_quad_fit(z3, f2, f3, s3)
		: nlnopt_cubic_fit(z3, f2, f3, s2, s3);

	if ( isinf(step) || isnan(step) )
		return z3/2.0;

	return step;
}

/** ===== Evaluate the objective and its slope at X0 + alpha * d ===== **/
//
//   Here xkp holds the base point X0 and xk1p receives the new X:
//       xk1p = xkp + alpha * dkp
//   The search direction dkp must have been computed already.
//   The gradient at the new X is written to gk1p; the slope (gk1p . dkp)
//   is returned through *df and the function value is the return value.
//
double  nlnopt_fxdfx1(struct nlnopt_s *nlp, double alpha, double *df)
{
	const unsigned int dims = nlp->num_dims;
	double *base  = nlp->xkp;
	double *trial = nlp->xk1p;
	double *grad  = nlp->gk1p;
	double *dir   = nlp->dkp;
	double value;
	unsigned int k;

	// trial point along the search direction
	for ( k = 0; k < dims; k++ )
		trial[k] = base[k] + alpha * dir[k];

	// function value and gradient at the trial point
	value = nlnopt_func(dims, trial, grad);

	// directional derivative at the trial point
	*df = vectors_inner_prod(dims, grad, dir);

	return value;
}

/*
 * Start (or restart) from the point stored in nlp->xkp: evaluate the
 * objective and its gradient there (gradient goes to gkp), set the search
 * direction dkp to -gkp, and return the function value.  *df receives the
 * slope along that direction, i.e. -(gkp . gkp).
 */
double  nlnopt_new_x0(struct nlnopt_s *nlp, double *df)
{
	const unsigned int dims = nlp->num_dims;
	double *point = nlp->xkp;
	double *grad  = nlp->gkp;
	double *dir   = nlp->dkp;
	double value;
	unsigned int k;

	value = nlnopt_func(dims, point, grad);
	*df = -vectors_inner_prod(dims, grad, grad);

	// steepest descent: move against the gradient
	for ( k = 0; k < dims; k++ )
		dir[k] = -grad[k];

	return value;
}

/* Accessor: returns the num_dims field of the optimizer state. */
unsigned int nlnopt_num_dims(struct nlnopt_s *nlp)
{
	const unsigned int dims = nlp->num_dims;

	return dims;
}

/** == Nonlinear programming : compute f(x_(k+1)) and f'(x_(k+1)) ==
 *
 *  Reads the point from nlp->xk1p (two coordinates), writes the function
 *  value to nlp->fk1, the gradient to nlp->gk1p and the slope along the
 *  current direction (gk1p . dkp) to nlp->dfk1.
 *
 *  The objective is hard-wired: PMO 2nd ed. Prob 2.2 is active, the
 *  Rosenbrock function is kept as a disabled alternative.
 */
void nlnopt_ops_fxgdx(struct nlnopt_s *nlp)
{
	const int use_rosenbrock = 0;
	const unsigned int dims = nlp->num_dims;
	double *point = nlp->xk1p;   // next x (point) vector
	double *grad  = nlp->gk1p;   // gradient vector
	double *dir   = nlp->dkp;    // (current) directional vector
	const double u = point[0];
	const double v = point[1];

	if ( use_rosenbrock ) {
		// Rosenbrock function, local minimizer x* = [1; 1]
		nlp->fk1 = 100.0 * pow(v -u*u, 2.0) + pow(1.0-u, 2.0);
		grad[0]  = 200.0 * (v - u*u) * (-2.0 * u) - 2.0 *(1.0 - u);
		grad[1]  = 200.0 * (v - u*u);
	} else {
		// PMO 2nd ed. Prob 2.2, local minimizer x* = [0.6959; -1.3479]
		nlp->fk1 = pow(u, 4.0) + u * v +  pow(1.0+v, 2.0);
		grad[0]  = 4.0 * pow(u, 3.0) + v;
		grad[1]  = u + 2.0 *(1.0 + v);
	}

	// slope (directional derivative) at the new point
	nlp->dfk1 = vectors_inner_prod(dims, grad, dir);
}


/**== Nonlinear programming: compute xk1 = xk + ak1 * dk ==
 *
 *  The step nlp->ak1 is applied along nlp->dkp starting from nlp->xkp and
 *  the result is written to nlp->xk1p.  The vectors and the stored
 *  function values / slopes are then dumped to the debug log.
 **/
void nlnopt_ops_xk(struct nlnopt_s *nlp)
{
	const double step = nlp->ak1;
	const unsigned int dims = nlp->num_dims;
	double *base  = nlp->xkp;
	double *trial = nlp->xk1p;
	double *dir   = nlp->dkp;
	unsigned int k;

	for ( k = 0; k < dims; k++ )
		trial[k] = base[k] + step * dir[k];

	// unconditional trace output
	vector_sprint("xk  ===> ", dims, base);
	vector_sprint("xk1 ===> ", dims, trial);
	vector_sprint("dk  ===> ", dims, dir);
	vector_sprint("gk1 ===> ", dims, nlp->gk1p);
	sprintf(dbg, "ak1 =%g fk,dfk = %g, %g; fk1, dfk1 = %g, %g", 
		step, nlp->fk, nlp->dfk, nlp->fk1, nlp->dfk1 ); DBG(dbg); 
}


/** Set up the first search direction.
 *
 *  With ak1 forced to 0, xk1 is computed (so xk1 == xk) and the objective
 *  and gradient are evaluated there.  The gradient gk1p is copied to gkp,
 *  both direction vectors dkp and dk1p are set to -gk1p, and the matching
 *  slope (gk1p . dkp) is stored in nlp->dfk1.
 *
 *  NOTE(review): the vector length used here is nlp->num_points, while the
 *  point/gradient helpers use nlp->num_dims -- confirm they agree.
 **/
void nlnopt_ops_dk0(struct nlnopt_s *nlp)
{
	unsigned int k, len;
	double *dir, *dir_next, *grad_prev, *grad, slope;

	nlp->ak1 = 0;
	nlnopt_ops_xk(nlp);      // xk1 = xk, because the step is zero
	nlnopt_ops_fxgdx(nlp);   // f and gradient at xk1

	len       = nlp->num_points;
	dir       = nlp->dkp;
	dir_next  = nlp->dk1p;
	grad_prev = nlp->gkp;
	grad      = nlp->gk1p;

	slope = 0;
	for ( k = 0; k < len; k++ ) {
		grad_prev[k] = grad[k];
		dir_next[k]  = -grad[k];          // dk0 = -gk1
		dir[k]       = dir_next[k];
		slope       += grad[k] * dir[k];  // g.T * d
	}

	vector_sprint("dk0 ==> ", len, dir);
	vector_sprint("gk0 ==> ", len, grad);
	sprintf(dbg, " slope0 = %g", slope); DBG(dbg); 

	nlp->dfk1 = slope;
}

/** Nonlinear programming: compute the new search direction
 *
 *      dk1 = -gk1 + beta * dk
 *
 *  using nlp->beta; the result is written to nlp->dk1p.
 **/
void nlnopt_ops_dk(struct nlnopt_s *nlp)
{
	const unsigned int len = nlp->num_points;
	const double beta = nlp->beta;
	double *dir_next = nlp->dk1p;
	double *dir      = nlp->dkp;
	double *grad     = nlp->gk1p;
	unsigned int k;

	for ( k = 0; k < len; k++ )
		dir_next[k] = -grad[k] + beta * dir[k];
}

/*== Use backward-forward method to find an initial bracket. ==*/


/** Nonlinear programming: compute alpha
 *
 *  Interpolates a new step length from the two stored trial steps
 *  (ak, fk, dfk) and (ak1, fk1, dfk1) using the scheme selected by
 *  nlp->ip_type, and stores it in both nlp->alpha and nlp->ak1.
 *
 *  For an interpolation type that is not implemented a message is logged
 *  and nlp->alpha / nlp->ak1 are left unchanged (previously an
 *  uninitialized value was stored).
 *
 *  No guard is applied against zero denominators (da == 0, df == 0, ...).
 **/
void nlnopt_ops_alpha(struct nlnopt_s *nlp)
{
	double dfk, dfk1, alpha, r,
	       da, df,    // delta of alpha's and function value
	       fk, fk1, ak, ak1, c1, c2, c3;

	dfk  = nlp->dfk;
	dfk1 = nlp->dfk1;
	ak   = nlp->ak;     fk  = nlp->fk;
	ak1  = nlp->ak1;    fk1 = nlp->fk1;

	da  = ak1 - ak;
	df  = fk1 - fk;
	switch ( nlp->ip_type ) {
	case NLNOPT_IP_QUAD1:
		alpha = ak - 0.5 * da / ( 1 - df /( da * dfk1 ) );
	break;

	case NLNOPT_IP_QUAD2:
		alpha = ak - da * dfk1 / df;
	break;

	case NLNOPT_IP_CUBIC:
		c3 = dfk;
		c2 = 2 * df/( da * da) - (2 * dfk + dfk1)/da;
		c1 = (dfk + dfk1) / (da * da) - 2 * df/(da * da *da);
		r = c2 * c2 - 3 * c1 * c3;
		if ( r >=  0 ) 
			alpha = ak - c3 / ( c2 + sqrt(r) );
		else
			alpha = (ak + ak1)/2;   // no real root: bisect
	break;

	default:
		DBG("Sorry not implemented yet.");
		return;   // nothing computed: keep the previous step untouched
	}

	nlp->alpha = nlp->ak1 = alpha;
}

/*
 * Step acceptance test.  Returns 1 when both
 *     fk1 < fk + rho * ak1 * dfk      (sufficient decrease)
 *     |dfk1| < -sigma * dfk           (slope reduced enough)
 * hold for the values stored in nlp, and 0 otherwise.  The quantities
 * involved are written to the debug log first.
 */
unsigned int nlnopt_ops_moveok(struct nlnopt_s *nlp)
{
	const double step     = nlp->ak1;
	const double f_old    = nlp->fk,  f_new     = nlp->fk1;
	const double slope_old = nlp->dfk, slope_new = nlp->dfk1;
	const double rho      = nlp->rho;
	const double sigma    = nlp->sigma;
	
	sprintf(dbg, "(fk1, fk) = (%g, %g) (dfk1,dfk) = (%g, %g) threahold fk = %g ",
		f_new, f_old, slope_new, slope_old, f_old + rho * step * slope_old); DBG(dbg); 

	if ( !( f_new < (f_old + rho * step * slope_old) ) )
		return 0;

	return ( fabs(slope_new) < (-sigma * slope_old) ) ? 1 : 0;
}

/** ---------- Nonlinear programming: compute beta --------
 *
 *  Computes the conjugate-gradient coefficient selected by nlp->alg from
 *  the gradients gk1p (new) and gkp (previous) and the direction dkp, and
 *  stores it in nlp->beta.  nlp->gamma is set as a side effect (1 unless
 *  the algorithm is NLNOPT_DCALG_MRM) and nlp->ykp is recomputed through
 *  nlnopt_ops_yk(); the HZ variant additionally modifies ykp through
 *  vector_add_scaled().
 *
 *  For NLNOPT_DCALG_D (not implemented) and for unknown algorithms beta is
 *  0, so that dk1 = -gk1 + beta * dk degenerates to steepest descent
 *  (previously an uninitialized value was stored).
 *
 *  No guard is applied against zero denominators.
 **/
void nlnopt_ops_beta(struct nlnopt_s *nlp)
{
	unsigned int n;
	double *gk1p, *gkp, *dkp, *ykp, s, t,  gamma,
	       ip_gk1, ip_gk, ip_yk, ip_gk1yk, ip_dkgk, ip_dkyk;
	double beta = 0.0;   // safe default for the cases that compute nothing

	n = nlp->num_points;
	gk1p = nlp->gk1p;
	gkp  = nlp->gkp;
	dkp  = nlp->dkp;
	ykp  = nlp->ykp;
	
	ip_gk1   = vectors_inner_prod(n, gk1p, gk1p);
	ip_gk    = vectors_inner_prod(n, gkp,  gkp);
	
	nlp->gamma = gamma = nlp->alg == NLNOPT_DCALG_MRM ?  sqrt(ip_gk1 / ip_gk) : 1;
	nlnopt_ops_yk(nlp);  // compute the difference vector of two gradient vectors
	
	ip_gk1yk = vectors_inner_prod(n, gk1p, ykp);
	ip_dkyk  = vectors_inner_prod(n, dkp,  ykp);
	ip_dkgk  = vectors_inner_prod(n, dkp,  gkp);

	switch( nlp->alg ) {
	case NLNOPT_DCALG_HS:
		beta =  ip_gk1yk / ip_dkyk;
	break;

	case NLNOPT_DCALG_FR:
		beta = ip_gk1 / ip_gk;
	break;
	
	case NLNOPT_DCALG_D :
		DBG("Sorry, not implemented yet."); 
	break;
	
	case NLNOPT_DCALG_PRP:
		beta = ip_gk1yk / ip_gk;
	break;
	
	case NLNOPT_DCALG_CD:
		beta = -ip_gk1 / ip_dkgk;
	break;
	
	case NLNOPT_DCALG_LS:
		beta = -ip_gk1yk / ip_dkgk;
	break;
	
	case NLNOPT_DCALG_DY:
		beta = ip_gk1 / ip_dkyk;
	break;
	
	case NLNOPT_DCALG_HZ:
		ip_yk = vectors_inner_prod(n, ykp,  ykp);
		s = -2 * ip_yk / ip_dkyk;
		vector_add_scaled(n, ykp, dkp, s);
		beta =  vectors_inner_prod(n, ykp, gk1p) / ip_gk;
	break;
	
	case NLNOPT_DCALG_MRM:
		t = vectors_inner_prod(n, gk1p, dkp);
		beta = ip_gk1yk / ( ip_gk + fabs(t) );
	break;

	default: DBG("Sorry invalid choice of decent algorithm ...");
	break;
	}

	//nlp->beta = max(0, beta);
	nlp->beta = beta;
}

/** Nonlinear programming: init for the next iteration
 *
 *  - the next trial step ak1 becomes max(2 / rho, dfk / dfk1), computed
 *    from the slopes before they are rolled over;
 *  - fk1 / dfk1 become the current fk / dfk;
 *  - the point buffers (xkp, xk1p) and the direction buffers (dkp, dk1p)
 *    are exchanged by swapping pointers, no data is copied.
 **/
void nlnopt_ops_next(struct nlnopt_s *nlp)
{
	double *old_point, *old_dir;

	nlp->ak1 = max(2/nlp->rho, nlp->dfk/nlp->dfk1);

	// roll the function value and slope over
	nlp->fk  = nlp->fk1;
	nlp->dfk = nlp->dfk1;

	// the new point becomes the current one
	old_point = nlp->xkp;
	nlp->xkp  = nlp->xk1p;
	nlp->xk1p = old_point;

	// the new direction becomes the current one
	old_dir   = nlp->dkp;
	nlp->dkp  = nlp->dk1p;
	nlp->dk1p = old_dir;
}

/** Nonlinear programming: compute yk = gk1 - gamma * gk
 *
 *  gamma is read from nlp->gamma; the result is written to nlp->ykp.
 **/
void nlnopt_ops_yk(struct nlnopt_s *nlp)
{
	const unsigned int len = nlp->num_points;
	const double scale = nlp->gamma;
	double *diff      = nlp->ykp;
	double *grad_prev = nlp->gkp;
	double *grad      = nlp->gk1p;
	unsigned int k;

	for ( k = 0; k < len; k++ )
		diff[k] = grad[k] - scale * grad_prev[k];
}


/*== The C version of Rassmussen's minimize.m ==*/
void nlnopt_ops2(struct nlnopt_s *nlp)
{
	unsigned int i, j, k, l, m, n, success, M;
	double z1, z2, z3, d1, d2, d3, f1, f2, f3, s, t, 
	rho, sigma, alpha, ak, ak1, tao1, tao2, tao3, EXT, INT, INF,
	limit, *dkp, *dk1p, *gkp, *gk1p, *xk1p, *xkp, *p, 
	beta, A, B, ls_failed; 
	
	rho = nlp->rho;
	dkp = nlp->dkp;        gk1p = nlp->gk1p;
	gkp = nlp->gkp;        dk1p = nlp->dk1p;
	sigma = nlp->sigma;    tao1 = nlp->tao1;
	alpha = nlp->alpha;    tao2 = nlp->tao2;
	EXT = nlp->EXT;        tao3 = nlp->tao3;
	INT = nlp->INT;        M = 50;
	
	xk1p = nlp->xk1p; 
	xkp  = nlp->xkp ;

	
	sprintf(dbg, "rho = %g  sigma = %g alpha = %g", rho, sigma, alpha); DBG(dbg);
	sprintf(dbg, "tao1= %g  tao2  = %g tao3  = %g", tao1, tao2, tao3 ); DBG(dbg);
	sprintf(dbg, "EXT = %g  INT   = %g M     = %d", EXT, INT, M      ); DBG(dbg);
	//return ;

	k = 10; 
	success = 0;
	n = nlp->num_points;
	
	// Init the x0 and x1 points.
	i  = 0;
	xkp[i] = xk1p[i] =-1.2;   i++;
	xkp[i] = xk1p[i] = 1;
	
	nlp->alg = NLNOPT_DCALG_HZ; // beta algorithm

	nlnopt_ops_dk0(nlp);        // compute initial f and df
	f1 = nlp->fk  = nlp->fk1;   // f(x0)
	d1 = nlp->dfk = nlp->dfk1;  // -f'(x0)
	
	sprintf(dbg, " n = %d", n); DBG(dbg); 
	sprintf(dbg, "xk1[0] = %g  xk1[1] = %g", xk1p[0], xk1p[1]); DBG(dbg); 
	sprintf(dbg, "gk1[0] = %g  gk1[1] = %g", gk1p[0], gk1p[1]); DBG(dbg); 
	
	
	z1 = nlp->ak1 = 1/(1-d1);    // initial step, need to adjust
	/*
	nlnopt_ops_xk(nlp);
	nlnopt_ops_fxgdx(nlp);
	f2 = nlp->fk1; 
	d2 = nlp->dfk1;
	sprintf(dbg, "(f1,d1) = (%g, %g) (f2, d2) = (%g, %g) z1 = %g",
		f1, d1, f2, d2, z1); DBG(dbg); 
	*/

	for ( l = 0; l < k; l++ ) {
		success = 0; 
		limit = -1;
		M = 20;
		nlnopt_ops_xk(nlp);
		nlnopt_ops_fxgdx(nlp);
		f2 = nlp->fk1; 
		d2 = nlp->dfk1;
		f3 = f1;    d3 = d1;    z3 = -z1;
		sprintf(dbg, "(f1,d1) = (%g, %g) (f2, d2) = (%g, %g) z1 = %g",
			f1, d1, f2, d2, z1); DBG(dbg); 
	

		while ( 1 ) {
			while( ( (f2 > (f1 + z1 * rho * d1) ) || 
				( d2 > (-sigma * d1) ) ) && (M > 0)  ) {
				limit = z1;
				if ( f2 > f1 ) {
					z2 = z3 - (0.5 * d3 * z3 * z3) / ( d3 * z3 + f2 - f3 );

				} else {
					A = 6 * ( f2 -f3 ) / z3 + 3 * (d2 + d3);
					B = 3 * ( f3 -f2 ) - z3 * ( 2 * d2 + d3);
					t = B* B - A * d2 * z3 * z3;
					if ( t > 0 )
						z2 = (sqrt(t) - B) /A ;
					else
						z2 = z3/2;
				}
				
				if ( isnan(z2) || isinf(z2) ) {
					z2 = z3 / 2;
				}	
				
				z2 = max(min(z2, tao1 * z3), tao2 * z3);

				z1 += z2;    // tighten the bracket
				nlp->ak1 = z2;
				nlnopt_ops_xk(nlp);
				nlnopt_ops_fxgdx(nlp);
				M--;

				f2 = nlp->fk1;
				d2 = nlp->dfk1;
				sprintf(dbg, "f2, d2 = %g, %g", f2, d2); DBG(dbg); 
				
				z3 -= z2;  // tighten the bracket
				sprintf(dbg, "z1, z2, z3 = %g, %g, %g", z1, z2, z3); DBG(dbg); 
			}

			if ( f2 > (f1 + z1 * rho * d1) || (d2 > -sigma * d1)  ) {
				break;
			} else if ( d2 > sigma * d1 ) {
				success = 1;
				nlp->fk = f2;
				nlp->dfk = d2;
				break;
			} else if ( M == 0 ) {
				break;
			}

			DBG("Line search failed A:");
			A = 6 * ( f2 - f3 ) / z3 + 3 * (d2 + d3);
			B = 3 * ( f3 - f2 ) -  z3 * (2 *d2 + d3);
			t = B * B - A * d2 * z3 * z3;
			if ( t > 0 )
				z2 = -d2 * z3 * z3 / ( B + sqrt(t));
			else {
				/*
				if ( limit < -0.5 ) {
					z2 = z1 * (EXT - 1.0);
				} else {
					z2 = (limit - z1 )/2.0;
				} */
			}

			if (  isnan(z2) || isinf(z2) || z2 < 0 ) {
				if ( limit < -0.5 ) {
					z2 = z1 * (EXT - 1.0);
				} else {
					z2 = (limit - z1 )/2.0;
				}
			
			} else if ( (limit > -0.5) && ( (z1 + z2) > limit) ) {
				z2 = (limit - z1)/2.0;	
			
			} else if ( (limit < -0.5) && ( (z1 + z2) > z1 * EXT) ) {
				z2 = z1 * (EXT - 1.0);	
			
			} else if ( z2 < (-z3 * INT) ) {
				z2 = -z3 * INT;
			} else if ( (limit > -0.5) && ( z2 < (limit-z1)*(1.0 - INT) ) ) {
				z2 = (limit -z1)*(1.0 - INT);
			}
			
			f3 = f2; d3 = d2; z3 = -z2;
			z1 = z1 + z2;
			nlp->ak1 = z2;
			nlnopt_ops_xk(nlp);
			nlnopt_ops_fxgdx(nlp);
			M = M -1;
			f2 = nlp->fk1;
			d2 = nlp->dfk1;
		}

		if ( success ) {
			//t = nlp->dfk1;   nlp->dfk1 = nlp->dfk;    nlp->dfk = t;
			nlp->fk = f1 = f2;
			
			nlnopt_ops_beta(nlp);   // compute beta 
			beta = nlp->beta; 
			sprintf(dbg, "OK: xk1p[0], xk1p[1] = %g, %g; gk1p[0], gk1p[1] = %g, %g, beta = %g d2 = %g",
				xk1p[0], xk1p[1], gk1p[0], gk1p[1], beta, d2); DBG(dbg); 
			nlnopt_ops_dk(nlp);     // compute (new) dk1
			nlp->dfk = d2 = vectors_inner_prod(n, dk1p, gk1p);	
			sprintf(dbg, " d2 = %g", d2); DBG(dbg); 
			
			if ( d2 > 0 ) {
				d2 = 0;
				for ( i = 0; i < n; i++ ) {
					dk1p[i] = -gk1p[i];
					d2 += dk1p[i] * gk1p[i]; // d2 should be negatvie since -gk1' * gk1
				}
				nlp->dfk = d2;
			}
			sprintf(dbg, " d2 = %g", d2); DBG(dbg); 
			/**Set up the new search direction vector dkp */
			for ( i = 0; i < n; i++ ) {
				dkp[i] = dk1p[i];
				xkp[i] = xk1p[i];
				gkp[i] = gk1p[i];
			}

			//p = xk1p;  xk1p = xkp;  xkp = p;  // save xkp = xk1p;
			//p = gk1p;  gk1p = gkp;  gkp = p;  //      gkp = gk1p;
			z1 *= min(1/rho, d1/(d2-2.2251e-308));
			nlp->ak1 = z1;
			d1 = d2;
			ls_failed = 0;
		} else {
			
			if ( ls_failed ) {
				DBG("failed...");
				break;
			}
			/** Need to set up a new dkp ??? **/		
		
		}
	
	}

}

/** =================================== **/

/** Here is the test function: 
 *  f(x) = x1 ** 4 + x1 * x2 + (1 + x2 ) ** 2
 *
 *  df(x)/d(x1) = 4 * x1**3 + x2
 *  df(x)/d(x2) = x1 + 2 *(1+ x2)
 *
 */
double nlnopt_fx(double n, double *xp)
{
	double f, x1, x2, x3;
	
	x1 = xp[0];   x2 = xp[1]; x3 = xp[2];
    f = pow(x1, 4) + x1 * x2 + pow(1 + x2, 2);

	return f;
}

/** Compute the gradient vector used by nlnopt_ops3().
 *
 *  The values currently produced are
 *      gp[0] = 3
 *      gp[1] = -2 * 81 * (x2 + 1)
 *      gp[2] = 20
 *  i.e. three components, of which only the second depends on the input.
 *
 *  NOTE(review): this is not the gradient of the function in nlnopt_fx()
 *  ( 4 * x1**3 + x2 ,  x1 + 2 * (1 + x2) ); that variant was disabled in
 *  the original code.
 *
 *  n  : number of elements (not consulted)
 *  xp : input point
 *  gp : output, three elements are written
 */
void nlnopt_gx(double n, double *xp, double *gp)
{
	const double second = xp[1];

	(void)n;

	gp[0] = 3;
	gp[1] = -2 * 81 * (second + 1);
	gp[2] = 20;
}

/*
 * Quasi-Newton demo on a fixed three-dimensional problem.
 *
 * Starts from x0 = (0.1, 0.1, -0.1) with two identity matrices as the
 * current / next inverse-Hessian approximations and runs 15 iterations of
 *     delta = alpha * H * g        (alpha fixed at -0.2)
 *     x1    = x + delta
 *     gamma = g1 - g
 *     H1    = BFGS update of H
 * printing the iterate after every step.  All buffers are private to this
 * function and released before it returns; nlp is not used.
 */
void nlnopt_ops3(struct nlnopt_s *nlp)
{
	const unsigned int dim = 3;
	const unsigned int max_iter = 15;
	const double alpha = -0.2;
	struct matrix_dsc_s *H_next, *H_cur, *H_tmp;
	double *scratch, *x_cur, *x_next, *g_cur, *g_next, *delta, *gamma, *tmp;
	unsigned int iter;

	H_next = matrix_create_eye(dim);
	H_cur  = matrix_create_eye(dim);

	scratch = CALLOC(dim, double);
	x_cur   = CALLOC(dim, double);
	x_next  = CALLOC(dim, double);
	g_cur   = CALLOC(dim, double);
	g_next  = CALLOC(dim, double);
	delta   = CALLOC(dim, double);
	gamma   = CALLOC(dim, double);

	// This is the x0 vector.
	x_cur[0] = 0.1;
	x_cur[1] = 0.1;
	x_cur[2] = -0.1;
	nlnopt_gx(dim, x_cur, g_cur);           // gradient at x0
	vector_sprint(" g0 ==> ", dim, g_cur);

	for ( iter = 0; iter < max_iter; iter++ ) {
		nlnopt_mul_H_x(delta, alpha, H_cur, g_cur);        // delta = alpha * H * g
		nlnopt_vectors_sum(dim, x_next, delta, x_cur, 1.0);
		nlnopt_gx(dim, x_next, g_next);                    // gradient at the new point
		nlnopt_vectors_sum(dim, gamma, g_next, g_cur, -1.0);

		nlnopt_bfgs(H_next, H_cur, delta, gamma);          // BFGS algorithm

		// the freshly computed quantities become the current ones
		H_tmp = H_next;  H_next = H_cur;  H_cur = H_tmp;
		tmp = x_next;    x_next = x_cur;  x_cur = tmp;
		tmp = g_next;    g_next = g_cur;  g_cur = tmp;

		vector_sprint("xk ===> ", dim, x_cur);
	}

	free(scratch);
	free(x_cur);
	free(g_cur);
	free(x_next);
	free(g_next);
	free(delta);
	free(gamma);

	matrix_dsc_fini(H_next);
	matrix_dsc_fini(H_cur);
}

/** integer test version
 *
 *  Backward-forward bracketing over the sampled function nlp->fxp, using
 *  array indices as abscissae.  Starting at index nlp->x0 with step nlp->h,
 *  the step is multiplied by nlp->tao while the function keeps decreasing;
 *  on the first increase the direction is reversed if no progress was made
 *  yet, otherwise the bracket [xa, xb] is taken from the last two indices.
 *  The bracket is only written to the debug log.
 *
 *  x, xa and xb now start at x0, so the final log line prints a defined
 *  (degenerate) bracket when the loop ends through its condition; they
 *  were uninitialized in that case before.
 *
 *  NOTE(review): indices are not checked against nlp->num_points, a
 *  negative step is converted to an unsigned index, and the loop does not
 *  terminate when both neighbours of x0 are uphill -- confirm the intended
 *  inputs.
 **/
void nlnopt_ops_backforward_int(struct nlnopt_s  *nlp)
{
	unsigned int  k,  x, x1, x2, x0, xa, xb;
	double *fxp, f1, f2, f0, t, h;

	fxp = nlp->fxp;
	
	x0 = nlp->x0;  // initial index, x0
	f0 = nlp->f0;  // initial function value
	t  = nlp->tao; 
	
	sprintf(dbg, "t = %g (x0, f0) = (%d, %g)", t, x0, f0); DBG(dbg);
	x1 = x2 = x0;
	f1 = f2 = f0;
	x  = xa = xb = x0;   // defined values even if no bracket is found
	k = 0; h = nlp->h;

	while ( !( (f1 > f0) && (f0 < f2 ) ) ) {
		
		x2 = x1 + h;
		f1 = fxp[x1];
		f2 = fxp[x2];
		sprintf(dbg, "f1, f2 = %g, %g x1,x2 =%d, %d", 
			f1, f2, x1, x2); DBG(dbg); 
		if ( f2 < f1 ) {
			// still going downhill: enlarge the step and advance
			h = h * t;
			x  = x1;
			x1 = x2;
			f1 = f2;
			k++;
		} else {
			if ( !k ){
				h = -h;      // first step went uphill: search the other way
			} else {
				xa = min(x, x2);
				xb = max(x, x2);
				break;
			}
				
		}
	
	}
	sprintf(dbg, "[xa, xb] =[%d %d]", xa, xb); DBG(dbg); 
}

/* One-dimensional test function f(x) = x^2. */
double fx(double x)
{
	const double squared = x * x;

	return squared;
}

/* Derivative of the test function fx(): f'(x) = 2 x. */
double dfx(double x)
{
	const double slope = 2 * x;

	return slope;
}

/*
 * Backward-forward bracketing for the one-dimensional test function fx().
 *
 * Starting at nlp->x0 with step nlp->h0, the step is multiplied by
 * nlp->tao while the function keeps decreasing; on the first increase the
 * direction is reversed if no progress was made yet, otherwise the bracket
 * is formed from the last two abscissae.  The bracket is stored in
 * nlp->ak (lower end) and nlp->ak1 (upper end).
 *
 * x, ak and ak1 now start at x0, so a defined (degenerate) bracket is
 * stored when the loop ends through its condition; uninitialized values
 * were stored in that case before.
 *
 * NOTE(review): the loop does not terminate when x0 is already a minimizer
 * of fx() -- confirm the intended inputs.
 */
void nlnopt_ops_backforward(struct nlnopt_s  *nlp)
{
	unsigned int  k; 
	double f1, f2, f0, t, h, x, x1, x2, x0, ak, ak1;

	x0 = nlp->x0;  // initial point, x0
	f0 = nlp->f0;  // initial function value
	t  = nlp->tao; 
	
	sprintf(dbg, "t = %g (x0, f0) = (%g, %g)", t, x0, f0); DBG(dbg);
	x1 = x2 = x0;
	f1 = f2 = f0;
	x  = ak = ak1 = x0;   // defined values even if no bracket is found
	k = 0; h = nlp->h0;

	while ( !( (f1 > f0) && (f0 < f2 ) ) ) {
		
		x2 = x1 + h;
		f1 = fx(x1);
		f2 = fx(x2);
		sprintf(dbg, "f1, f2 = %g, %g x1,x2 =%g, %g", 
			f1, f2, x1, x2); DBG(dbg); 
		if ( f2 < f1 ) {
			// still going downhill: enlarge the step and advance
			h = h * t;
			x  = x1;
			x1 = x2;
			f1 = f2;
			k++;
		} else {
			if ( !k ){
				h = -h;      // first step went uphill: search the other way
			} else {
				ak  = min(x, x2);
				ak1 = max(x, x2);
				break;
			}
				
		}
	
	}
	sprintf(dbg, "[ak, ak1] =[%g %g]", ak, ak1); DBG(dbg); 
	nlp->ak  = ak;
	nlp->ak1 = ak1;
}


/*
 * One-dimensional interpolation step.
 *
 * Reads two sample points from nlp->ip_x (abscissae), nlp->ip_y (function
 * values) and nlp->ip_df (slopes), estimates the minimizer and stores it
 * in nlp->alpha.  The scheme is selected by nlp->qip1 / nlp->qip2; when
 * neither is set, cubic interpolation is used.
 *
 * Fix: the cubic branch divided by (df + 2 * w) with df never assigned.
 * df is now the slope difference df2 - df1, as the cubic formula requires.
 *
 * No guard is applied against zero denominators or a negative radicand.
 */
void nlnopt_ops_ip(struct nlnopt_s *nlp)
{
	double *xp, *yp, *dfp, alpha1, alpha2, f1, f2, df1, df2, alpha, 
	        dx, dy, df, s, w, z;
	
	xp = nlp->ip_x;
	yp = nlp->ip_y;
	dfp= nlp->ip_df;

	alpha1 = xp[0];   f1 = yp[0];   df1 = dfp[0];
	alpha2 = xp[1];	  f2 = yp[1];   df2 = dfp[1];
	
	if ( nlp->qip1 ) {
		/** Quadratic interpolation I:  alpha1, alpha2, f1, 
	 	 *  f2 and df2 are used. Compute the minimizer alpha **/
		dx = alpha2 - alpha1;
		dy = f2 - f1;
		alpha = alpha2 + 0.5 * dx * df2 /( dy/dx - df2);
	
	} else if ( nlp->qip2 ) {
		/** Quadratic interpolation II:  alpha1, alpha2, df1 and df2
	 	 *  are used. Compute the minimizer alpha; this method is also
	 	 *  known as the secant method.  **/
		df = df2 - df1;
		dx = alpha2 - alpha1;
		alpha = alpha2 - dx * df2 / df;
	
	} else {
		/** Cubic interpolation using both function values and slopes:
		 *     s = 3 (f2 - f1) / (alpha2 - alpha1)
		 *     z = s - df1 - df2
		 *     w = sqrt(z^2 - df1 * df2)
		 *     alpha = alpha1 + dx (w - df1 - z) / (df2 - df1 + 2 w)  **/
		dx = alpha2 - alpha1;
		dy = f2 - f1;
		df = df2 - df1;
		s = 3 * dy / dx;
		z = s  - df1 - df2;
		w = sqrt(z * z - df1 * df2);
		alpha = alpha1 + dx * ( w - df1 - z ) /(df + 2 * w);
	}

	nlp->alpha = alpha;
}








/*** ============= NTU ML 1 and 2: Foundations and Techniques ============ **/

// Integer selector constants, numbered 1..5.
// NOTE(review): the names suggest variants of a VC generalization bound;
// their users are not visible in this part of the file -- confirm.
#define VCTYPE_ORIGINAL     1
#define VCTYPE_VARIANT      2
#define VCTYPE_RADEMACHER   3
#define VCTYPE_PANBROEK     4
#define VCTYPE_DEVROYE      5

// Per-layer state of a neural network: a small bit-field header, an id
// string and pointers to the matrices associated with layer (l) and its
// connection to layer (l+1).  The struct only holds pointers; which code
// allocates and frees the matrices is not visible here.
struct nn_layer_s {      // --- matrixes for layer l ---
	// 32 bits in total: 16-bit id, 1-bit flag, 15 remaining bits
	unsigned int id : 16,
			use_val :  1,        // Use the validation data sets
				  xx: 15;        // remaining bits (presumably unused padding)
	unsigned char *idstrp;       // presumably a printable layer id -- TODO confirm
	struct matrix_dsc_s *amxp,   // input or activation data
	             *dwmxp,*wmxp,   // weight gradient and weights, connecting layers (l) and (l+1)
                *dbmxp, *bmxp,   // bias matrix for -- wmxp --
				*db1_mxp,        // NOTE(review): purpose not documented in the original
			    *d1_mxp, *dmxp,  // delta matrix for layer (l+1) and (l), respectively
                        *zmxp,   // z matrix for layer (l+1)
	// -------- for validation only ----------
		*val_zmxp, *val_amxp;    
};




// Logistic function 1 / (1 + exp(-x)).  The argument is parenthesized so
// that expression arguments such as sigmoid(a - b) negate the whole
// expression instead of only its first operand.
#define sigmoid(x)   (1.0 / (1.0 + exp(-(x))) )
// tanh expressed through the logistic function.  Note that this macro
// replaces the <math.h> tanh() for all code that follows it.
#define tanh(x)      (2.0 * sigmoid(2.0 * (x)) - 1.0)
/*
 *  Note:
 *
 *  sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x))
 *
 *
 *  By definition, 
 *                    exp(x) - exp(-x)
 *        tanh(x) = --------------------
 *                    exp(x) + exp(-x)
 *
 *
 *   The following equivalent expressions can be useful in computation
 *	too.
 *
 *   ---  tanh(x)  = 2 * sigmoid(2 * x) - 1                     --- 
 *   ---  tanh'(x) = 4 * sigmoid(2 * x) * ( 1 - sigmoid(2*x))   --- 
 *   ---                                                        ---
 *   ---  or                                                    ---
 *   ---                      2                                 ---
 *   ---  tanh'(x) = 1 - tanh  (x)                              ---
 *
 */

/*
 * Apply tanh element-wise, in place, to every entry of mxp, using the
 * identity tanh(x) = 2 * sigmoid(2 * x) - 1.
 */
void  ntuml_tanh(struct matrix_dsc_s *mxp)
{
	double **rows = MXRPP(mxp);
	const unsigned int nrows = MXROWS(mxp);
	const unsigned int ncols = MXCOLS(mxp);
	unsigned int r, c;

	for ( r = 0; r < nrows; r++ ) {
		double *row = rows[r];

		for ( c = 0; c < ncols; c++ ) {
			const double v = row[c];

			row[c] = 2 * sigmoid(2 * v) - 1;
		}
	}
}

/*
 * Replace every entry x of mxp, in place, by the derivative of tanh at x:
 *
 *     tanh'(x) = 4 * sigmoid(2x) * (1 - sigmoid(2x))  =  1 - tanh(x)^2
 *
 * Fix: the second factor used sigmoid(-2x).  Since
 * 1 - sigmoid(-2x) == sigmoid(2x), the old expression evaluated to
 * 4 * sigmoid(2x)^2, which equals the derivative only at x == 0.
 */
void  ntuml_tanh_prime(struct matrix_dsc_s *mxp)
{
	unsigned int i, j, m, n;
	double **rpp, *rp, x, sg;

	rpp =  MXRPP(mxp);
	m  =  MXROWS(mxp);
	n  =  MXCOLS(mxp);
	
	for ( i = 0; i < m; i++ ) {
		rp = rpp[i];
		for ( j = 0; j < n; j++ ) {
			x  = rp[j];
			sg = sigmoid(2. * x);          // evaluated once, used twice
			rp[j] = 4. * sg * ( 1. - sg );
		}
	}
}

/*
 * Element-wise tanh from mxp into amxp: amxp[i][j] = tanh(mxp[i][j]),
 * computed as 2 * sigmoid(2 * x) - 1.  mxp is left untouched; the loop
 * bounds come from mxp, so amxp must be at least as large.
 */
void  ntuml_tanh2(struct matrix_dsc_s *mxp, struct matrix_dsc_s *amxp)
{
	double **src_rows = MXRPP(mxp);
	double **dst_rows = MXRPP(amxp);
	const unsigned int nrows = MXROWS(mxp);
	const unsigned int ncols = MXCOLS(mxp);
	unsigned int r, c;

	for ( r = 0; r < nrows; r++ ) {
		double *src = src_rows[r];
		double *dst = dst_rows[r];

		for ( c = 0; c < ncols; c++ ) {
			const double v = src[c];

			dst[c] = 2 * sigmoid(2 * v) - 1;
		}
	}
}

/*
 * Element-wise derivative of tanh from mxp into pmxp:
 *
 *     pmxp[i][j] = 4 * sigmoid(2x) * (1 - sigmoid(2x)),   x = mxp[i][j]
 *
 * mxp is left untouched; the loop bounds come from mxp, so pmxp must be at
 * least as large.
 *
 * Fix: the second factor used sigmoid(-2x).  Since
 * 1 - sigmoid(-2x) == sigmoid(2x), the old expression evaluated to
 * 4 * sigmoid(2x)^2, which equals the derivative only at x == 0.
 */
void  ntuml_tanh_prime2(struct matrix_dsc_s *mxp, struct matrix_dsc_s *pmxp)
{
	unsigned int i, j, m, n;
	double **rpp, *rp, x, sg, **prpp, *prp;

	rpp  =  MXRPP(mxp);
	prpp =  MXRPP(pmxp);
	m    =  MXROWS(mxp);
	n    =  MXCOLS(mxp);
	
	for ( i = 0; i < m; i++ ) {
		rp  = rpp[i];
		prp = prpp[i];
		for ( j = 0; j < n; j++ ) {
			x  = rp[j];
			sg = sigmoid(2 * x);           // evaluated once, used twice
			prp[j] = 4 * sg * ( 1 - sg );
		}
	}
}


/**  ----- Create a sliced matrix using mxp as the basis, [rs, rt) x [cs, ct) ----- **/
// Note:  rs and cs are inclusive while rt and ct are exclusive, i.e. rt and ct are
// one past the last row and the last column that get extracted.
/*        If   mxp is :                then ntuml_mxpslice(mxp, 0, 2, 1, 3)  will get 
 *              *          *        
 *      |  0    1     2    3             |    0    1    
 *    --+--------------------         ---+---------------   
 *  * 0 |  1   |2     3|   4           0 |    2    3
 *      |      |       |                 | 
 *    1 |  5   |6     7|   8           1 |    6    7
 *      |      +-------+                 |
 *  * 2 |  9   10    11   12 
 */

/*
 * Build a sliced matrix covering rows [rs, rt) and columns [cs, ct) of mxp.
 *
 * Returns the matrix produced by matrix_slice(), or NULL (after logging a
 * warning) when either range is empty or reversed.  The temporary index
 * ranges are freed before returning.
 */
struct matrix_dsc_s *ntuml_mxpslice(struct matrix_dsc_s *mxp,
	unsigned int rs, unsigned int rt, unsigned int cs, unsigned int ct)
{
	unsigned int *row_idx, *col_idx, nrows, ncols;
	struct matrix_dsc_s *slice;

	// reject empty or reversed ranges up front
	if ( (rs >= rt) || (cs >= ct) ) {
		sprintf(dbg, "Warning: supplied ranges are not correct [%d, %d), [%d, %d)",
			rs, rt, cs, ct);   DBG(dbg); 
		return NULL;
	}

	nrows = rt - rs;
	ncols = ct - cs;

	row_idx = nau_gen_range(rs, rt);
	col_idx = nau_gen_range(cs, ct);

	matrix_slice_params(mxp, nrows, ncols);
	matrix_slice_set_params(mxp, row_idx, col_idx);
	slice = matrix_slice(mxp);

	free(row_idx);
	free(col_idx);

	return slice;
}

/**
 *
 *  Input : a list specifying the number of units in each layer.
 *  Output: a list of tuples, which specify the numbers of rows and 
 *          columns of weight matrixes between each layer.
 *         
 *        # matrixes =  # layers - 1
 *        # row-column of the tuples = # matrixes * 2
 *          
 **/
unsigned int *nn_wmxp_params(unsigned int num_l, unsigned int l[], unsigned int *num_a)
{
	const unsigned int num_w = num_l -1;   // one weight matrix between each pair of layers
	const unsigned int verbose = 0;        // set to non-zero to trace the configuration
	unsigned int *dims, w;

	// two entries (rows, columns) per weight matrix
	*num_a = num_w * 2;
	dims   = MALLOC(num_w*2, unsigned int);

	if ( verbose ) {
		DBG("NN cfg list...");
		vector_print_int(num_l, l);
		DBG("end of NN cfg list...");
	}

	for ( w = 0; w < num_w; w++ ) {
		unsigned int *pair = &dims[2*w];

		pair[0] = l[w];       // rows:    units in layer w
		pair[1] = l[w+1];     // columns: units in layer w+1
		if ( verbose ) {
			sprintf(dbg, "wmxp %d : (%3d x %-3d)",
				w, pair[0], pair[1]);
			DBG(dbg); 
		}
	}

	return dims;   // ownership passes to the caller
}




/*
 *  The following three routines can be used as a template to implement
 *	the matrix multiplication
 *
 *  A.T() means transposed A matrix
 *  B.T() means transposed B matrix
 *
 *  	A     *  B      = C
 *  	A.T() *  B      = C
 *  	A     *  B.T()  = C
 *
 */
/*
 * C = A * B.
 *
 * Nothing is written when mxmul_check_abc() rejects the operands.  The
 * result dimensions are taken from C and the inner dimension from the
 * column count of A.
 */
void nn_mul_abc(struct matrix_dsc_s *amxp, struct matrix_dsc_s *bmxp, struct matrix_dsc_s *cmxp)
{
	unsigned int rows, cols, inner, r, c, k;
	double **a_rows, **b_rows, **c_rows, *a_row, *out_row, acc;

	if ( !mxmul_check_abc(amxp, bmxp, cmxp) )
		return;

	rows   = MXROWS(cmxp);
	cols   = MXCOLS(cmxp);
	inner  = MXCOLS(amxp);
	a_rows = MXRPP(amxp);
	b_rows = MXRPP(bmxp);
	c_rows = MXRPP(cmxp);

	for ( r = 0; r < rows; r++ ) {
		out_row = c_rows[r];
		a_row   = a_rows[r];
		for ( c = 0; c < cols; c++ ) {
			// inner product of row r of A with column c of B
			acc = 0;
			for ( k = 0; k < inner; k++ )
				acc += a_row[k] * b_rows[k][c];

			// Template hook: a bias could be added to acc here, or an
			// activation such as the sigmoid applied, before storing.
			out_row[c] = acc;
		}
	}
}

/*
 * C = A.T() * B.
 *
 * Nothing is written when mxmul_check_aTbc() rejects the operands.  The
 * result dimensions are taken from C and the inner dimension from the row
 * count of A.
 */
void nn_mul_aTbc(struct matrix_dsc_s *amxp, struct matrix_dsc_s *bmxp, struct matrix_dsc_s *cmxp)
{
	unsigned int rows, cols, inner, r, c, k;
	double **a_rows, **b_rows, **c_rows, *out_row, acc;

	if ( !mxmul_check_aTbc(amxp, bmxp, cmxp) )
		return;

	rows   = MXROWS(cmxp);
	cols   = MXCOLS(cmxp);
	inner  = MXROWS(amxp);
	a_rows = MXRPP(amxp);
	b_rows = MXRPP(bmxp);
	c_rows = MXRPP(cmxp);

	for ( r = 0; r < rows; r++ ) {
		out_row = c_rows[r];
		for ( c = 0; c < cols; c++ ) {
			// inner product of column r of A with column c of B
			acc = 0;
			for ( k = 0; k < inner; k++ )
				acc += a_rows[k][r] * b_rows[k][c];

			// Template hook: a bias could be added to acc here, or an
			// activation such as the sigmoid applied, before storing.
			out_row[c] = acc;
		}
	}
}

/*
 *  C = A * B.T()
 *
 *  Each C[i][j] receives the dot product of row i of A with row j of B;
 *  B is never physically transposed.  Nothing is written when
 *  mxmul_check_abTc() returns 0.
 */
void nn_mul_abTc(struct matrix_dsc_s *amxp, struct matrix_dsc_s *bmxp, struct matrix_dsc_s *cmxp)
{
	unsigned int ok, row, col, idx, nrows, ncols, inner;
	double acc, **a_rows, **b_rows, **c_rows, *a_row, *b_row, *c_row;

	ok = mxmul_check_abTc(amxp, bmxp, cmxp);
	if ( !ok )
		return;

	nrows  = MXROWS(cmxp);
	ncols  = MXCOLS(cmxp);
	inner  = MXCOLS(amxp);   // summed dimension runs along the rows of A and B
	a_rows = MXRPP(amxp);
	b_rows = MXRPP(bmxp);
	c_rows = MXRPP(cmxp);

	for ( row = 0; row < nrows; row++ ) {
		c_row = c_rows[row];
		a_row = a_rows[row];
		for ( col = 0; col < ncols; col++ ) {
			acc = 0;
			b_row = b_rows[col];
			for ( idx = 0; idx < inner; idx++ )
				acc += a_row[idx] * b_row[idx];

			// Hook point: a bias could be added to acc here, and an
			// activation such as 1/(1 + exp(-acc)) applied, before storing.
			c_row[col] = acc;
		}
	}
}


/** ---------------- MFCC  --------------- **/
// Substituted for a filter-bank output that is exactly 0 before taking its
// log() in mfcc_coefficients(); the value is the same EPS as in Octave.
#define MFCC_EPS   2.2204e-16   // the same EPS as in Octave
/*
 *  Working context for the MFCC feature extraction.  One instance is created
 *  by mfcc_init() and released by mfcc_fini(); every mfcc_*() routine reads
 *  its parameters from, and leaves its results in, this structure.
 */
struct mfcc_s {
	// num_filters: triangular filters in the bank; num_ceps: columns kept for
	// the SVD in mfcc_features(); fft_len: FFT size handed to mfcc_fft_psp();
	// num_trainsets / num_valsets: file counts gathered by mfcc_ops3().
	unsigned int num_filters, num_ceps, fft_len, num_trainsets, num_valsets;
	struct matrix_op_s *mop;        // matrix context that owns the matrices created in mfcc_init()
	struct matrix_dsc_s *fbankmxp;  // NOTE(review): not assigned anywhere in the code visible here
	// freq_lo/freq_hi: band covered by the filter bank (Hz).
	// freqp/melp/dftbinp: filter edge points as frequency, mel value and DFT bin.
	// filterpp[i]: weights of filter i, allocated by mfcc_one_filter().
	double freq_lo, freq_hi, *dftbinp, *freqp, *melp, **filterpp;
	
	// ------- audio info and the data -------
	char *audio_fname;  // WAV file read by mfcc_read_data(); the string is not copied
	int   audio_label;  // label attached to that file (set by mfcc_ops3())
	
	// ---- features and coefficient matrices ----
	// featmxp: per-signal matrix, one row per frame; ccmxp: one feature row
	// per audio file; valmxp: created in mfcc_init().
	struct matrix_dsc_s *featmxp, *ccmxp, *valmxp;  
	// audio_dp: samples of the current file (heap, grown with realloc());
	// featp: row of ccmxp receiving the current feature vector;
	// svalp: singular values from DGESVD; workp: DGESVD work array.
	double *audio_dp, *featp, *svalp, *workp;
	double  frame_interval_len, frame_interval_inc;  // in seconds
	// sample_rate, num_bits_per_sample, num_dps: taken from the WAV header;
	// frame_len, frame_inc: framing in samples (mfcc_op_frame());
	// num_frames, num_padding, num_padded_dps: results of mfcc_framing_params();
	// feat_len: number of doubles in one feature vector.
	unsigned int sample_rate, num_bits_per_sample, num_dps, num_padded_dps,
	 frame_len, frame_inc, num_frames, num_padding, feat_len;

	// ------------ FFT ------------
	// All of these are rows of the "FFT storage" matrix created in mfcc_init():
	// realsigp: input signal (debug only); cosp/sinp and c2sp/s2np: cos/sin
	// lookup tables for x and x/2; xrp/xip: temporary real/imaginary parts;
	// xrealp/ximagp: FFT result; psp: spectrum; dctp: DCT scratch buffer.
	double *realsigp, *cosp, *c2sp, *sinp, *s2np, *xrp, *xip,
	       *xrealp, *ximagp, *psp, *dctp;

};

/*
 *  In-memory image of a canonical 44-byte RIFF/WAVE header; mfcc_read_data()
 *  fills it with a single fread() of sizeof(struct wav_s) bytes.
 *  NOTE(review): this presumably relies on the compiler inserting no padding
 *  and on the host byte order matching the file -- confirm on the target.
 */
struct wav_s {
	char id[4];           //  "RIFF" chunk descriptor
	unsigned int size;    //  36 + SubChunk2Size
	char type[4];         //  "WAVE"

	char fmt[4];          //  "fmt " subchunk id
	unsigned int fmt_length;  // 16 for PCM; this is the size of the rest of the
	                          // subchunk which follows this number
	unsigned short fmt_type,  // 1 for PCM (linear quantization)
	                          num_channels; 
	unsigned int sample_rate;
	unsigned int byte_rate;
	unsigned short blockalign, bits_per_sample;

	char data[4];         // "data" chunk id
	unsigned int data_size;   // size of the sample data, in bytes
};

// ---- Forward declarations for the MFCC / FFT routines defined below ----
// Context life cycle and top-level drivers
struct mfcc_s  *mfcc_init(struct na_params_s *p);
void  mfcc_fini(struct mfcc_s  *mfccp);
void  mfcc_read_data(struct mfcc_s *mfccp);
void  mfcc_ops(struct mfcc_s  *mfccp);
void  mfcc_ops3(struct mfcc_s  *mfccp);
// Filter bank, framing, diagnostics and per-signal feature computation
void  mfcc_op_fbank(struct mfcc_s  *mfccp);
void  mfcc_op_frame(struct mfcc_s  *mfccp);
void  mfcc_op_plot(struct mfcc_s *mfccp);
void  mfcc_coefficients(struct mfcc_s *mfccp);
void  mfcc_features(struct mfcc_s *mfccp);
struct matrix_dsc_s *nn_feat_extract(struct mfcc_s *mfccp, char *wavfilep);
// FFT helpers
void  fft_gen_cstable(unsigned int N, double *cp, double *sp);
void  fft_gen_cstable_half(unsigned int N, double *cp, double *sp);
void  mfcc_framing_params(unsigned int num_samples, unsigned int frame_len, unsigned int frame_inc, unsigned int *no_frames, unsigned int *no_padding, unsigned int *no_padded_signal);
double *mfcc_one_filter(unsigned int a, unsigned int b, unsigned int c);
void  fft_stalone(unsigned int N, double *cp, double *sp, double *xrealp, double *ximagp);
void  mfcc_fft_psp(unsigned int n, double *realsigp, double *cp, double *sp, double *c2p,
double *s2p, double *xrealp, double *ximagp, double *xrp, double *xip, double *psp);
void  mfcc_print_filters(struct mfcc_s *mfccp);
unsigned int ibitr(unsigned int j, unsigned int nu);


double freq2mel(double f)
{
	return 1125.0 * log(1.0 + f/700.0);
}

double mel2freq(double mel)
{
	return 700.0 * (exp(mel/1125.0) - 1.0);
}

/*------------ read audio data from a WAV file for MFCC ----------- */
//
//   Read the samples of the WAV file named by mfccp->audio_fname into
//   mfccp->audio_dp (heap buffer, reused and resized between calls), with
//   every sample divided by 2^(bits-1).  On success these fields are set:
//   - num_dps:      number of samples in the signal
//   - sample_rate:  sampling rate at which the signal was sampled
//   - freq_hi:      half of the sampling rate, the highest frequency
//                   the signal can carry
//   - num_bits_per_sample:  the number of bits representing the
//        amplitude of the signal
//
//   Only the canonical 44-byte header with 16-bit samples is understood,
//   because samples are decoded one `short` at a time.  On any error a
//   message is logged and the context is left as it was before the call.
//   A file that ends before data_size bytes were read is zero-filled.
//
void mfcc_read_data(struct mfcc_s *mfccp)
{
	struct wav_s w, *wp = &w;
	unsigned int i, num_dps;
	short t;
	size_t bytes_per_sample, alloc_dps;
	double *dp, amp;
	char *fname;
	FILE *fp;
	
	fname = mfccp->audio_fname;
	
	// WAV is binary data: "rb" keeps the stream untranslated on every platform.
	fp = fopen(fname, "rb");
	if ( !fp ) {
		sprintf(dbg, "error to open %s", fname); 
		DBG(dbg); 
		return;
	}
	
	if ( fread((void *)(wp), sizeof(struct wav_s), 1, fp) != 1 ) {
		sprintf(dbg, "error to read the WAV header of %s", fname); 
		DBG(dbg); 
		fclose(fp);
		return;
	}
	
	if ( 0 ) {
		// The four-character tags are not NUL-terminated, hence "%.4s".
		sprintf(dbg, "%.4s", wp->id);    DBG(dbg); 
		sprintf(dbg, "%.4s", wp->type);  DBG(dbg); 
		sprintf(dbg, "%.4s", wp->fmt);   DBG(dbg); 
		sprintf(dbg, "%.4s", wp->data);  DBG(dbg); 

		sprintf(dbg, "size +8    = %u",  wp->size+8   );    DBG(dbg); 
		sprintf(dbg, "data size  = %u",  wp->data_size);    DBG(dbg); 
		sprintf(dbg, "fmt length = %u",  wp->fmt_length);   DBG(dbg); 
		sprintf(dbg, "fmt type   = %d",  wp->fmt_type);     DBG(dbg); 
		sprintf(dbg, "# of chan. = %d",  wp->num_channels); DBG(dbg); 
		sprintf(dbg, "sample rate= %u",  wp->sample_rate);  DBG(dbg); 
		sprintf(dbg, "byte rate  = %u",  wp->byte_rate);    DBG(dbg); 
		sprintf(dbg, "block align = %d", wp->blockalign);   DBG(dbg); 
	}
	
	// The sample width lives in bits_per_sample.  fmt_length is the size of
	// the fmt subchunk and only coincidentally equals 16 for 16-bit PCM.
	bytes_per_sample = sizeof(t);
	if ( wp->bits_per_sample != 8 * bytes_per_sample ) {
		sprintf(dbg, "unsupported WAV sample width (%d bits) in %s",
			wp->bits_per_sample, fname); DBG(dbg); 
		fclose(fp);
		return;
	}

	num_dps = wp->data_size / (unsigned int)bytes_per_sample;
	amp = (double) ( 1 << (wp->bits_per_sample-1) );  // "amplitude" for normalization

	// Never ask realloc() for 0 bytes, and keep the old buffer if it fails.
	alloc_dps = num_dps ? num_dps : 1;
	if ( alloc_dps > (size_t)-1 / sizeof(double) ) {
		sprintf(dbg, "audio file too large = %s", fname); DBG(dbg); 
		fclose(fp);
		return;
	}
	dp = realloc( mfccp->audio_dp, alloc_dps * sizeof(double) );
	if ( !dp ) {
		sprintf(dbg, "realloc() error for auddio file  = %s",
			fname); DBG(dbg); 
		fclose(fp);
		return;
	}

	mfccp->audio_dp = dp;
	mfccp->sample_rate = wp->sample_rate;
	mfccp->freq_hi = wp->sample_rate / 2.0;
	mfccp->num_dps = num_dps;
	mfccp->num_bits_per_sample = wp->bits_per_sample;

	if ( 0 ) {
		sprintf(dbg, "WAV sample rate = %u", mfccp->sample_rate); DBG(dbg); 
		sprintf(dbg, "WAV data size   = %u", mfccp->num_dps    ); DBG(dbg); 
		sprintf(dbg, "WAV num_bits/sample= %u", 
			mfccp->num_bits_per_sample ); DBG(dbg); 
		sprintf(dbg, "WAV amplitude for normalization = %g", amp); DBG(dbg); 
	}

	for ( i = 0; i < num_dps; i++ ) {
		if ( fread(&t, sizeof(t), 1, fp) != 1 )
			break;                    // short file: stop decoding
		dp[i] = (double)(t) / amp;    // normalized sample
	}
	if ( i < num_dps ) {
		sprintf(dbg, "%s is truncated: %u of %u samples read",
			fname, i, num_dps); DBG(dbg); 
		for ( ; i < num_dps; i++ )
			dp[i] = 0.0;
	}
	
	fclose(fp);
}

/*
 *   Generate some test data for the FFT.
 *
 *   The k-th generated point is exp(-k), k = 0, 1, ..., n-1, where
 *   n = interval * samplingrate truncated to an integer.
 *
 * 	 in:  
 * 	 	interval:  the time interval to run FFT
 * 	 	samplingrate : the sampling rate to generate the test samples
 *   out: 
 *   	return value:  the sampling points generated (allocated with MALLOC),
 *   	               or NULL when the allocation fails
 *   	num_points:  the total number of points generated (0 on failure)
 */

double *fft_gen_testdata(double interval, double samplingrate, unsigned int *num_points)
{
	unsigned int i, n;
	double *dp, span;

	// A negative (or NaN) product cannot be converted to unsigned: treat as 0.
	span = interval * samplingrate;
	n = ( span > 0.0 ) ? (unsigned int)span : 0;

	dp = MALLOC(n, double);
	if ( !dp ) {
		*num_points = 0;
		return NULL;
	}

	for ( i = 0; i < n; i++ ) {
		dp[i] =  exp( -(double)(i) );
	}
	
	*num_points = n;

	return dp;
}


/*
 *  Top-level MFCC driver: create the context, build the filter bank and
 *  framing parameters, extract the features of the data set, then release
 *  the context.
 */
void run_mfcc(struct na_params_s *p)
{
	struct mfcc_s *ctx = mfcc_init(p);

	mfcc_ops(ctx);
	mfcc_ops3(ctx);
	mfcc_fini(ctx);
}

/*
 *   Currently, I hard-coded the parameters for MFCC operations. 
 *
 *   Later, all the params should be passed thru the structure
 *    
 *
 *     struct na_params_s *p, 
 *
 *   which receives its params from the command line or by
 *   reading a params/config file.
 *                                      
 *                                      Mark, May 22, 2016
 *
 */
struct mfcc_s  *mfcc_init(struct na_params_s *p)
{
	struct mfcc_s  *mfccp;
	struct matrix_op_s *mop;
	struct matrix_dsc_s *fbankmxp, *fftmxp;
	unsigned int i, num_filters, num_ceps, fft_len, feat_len;
	double **rpp, *freqp, *melp;

	mfccp = CALLOC(1, struct mfcc_s); 
    mfccp->mop = mop = matrix_op_init2("ntuml");
	mfccp->num_filters = num_filters = 40;
	mfccp->num_ceps = num_ceps = 20;
	mfccp->filterpp = MALLOC(num_filters, double *);
	mfccp->freq_lo  = 0;          // not set
	mfccp->freq_hi  = 512;        // 1 kHz, 512Hz
	mfccp->frame_interval_len = 0.512;  // 0.512, 1.024 seconds
	mfccp->frame_interval_inc = 0.015;  // increment by 0.1, 0.015 0.01, 0.005 seconds
	mfccp->fft_len  = fft_len = 512;   // 1024 512 or 256, should be enough
	mfccp->sample_rate = 2000;

	i = 10;
	fbankmxp = matrix_create2(mop, "MFCC filter bank", i, num_filters + 5);
	rpp = MXRPP(fbankmxp), i = 0;
	mfccp->freqp  = rpp[i++];
	mfccp->melp   = rpp[i++];
	mfccp->dftbinp= rpp[i++];
	mfccp->svalp  = rpp[i++];  // singular values for the feature matrix
	mfccp->workp = malloc( 50 * num_ceps * sizeof(double) );
	
	i = 12;   // Enough to cover the storage used by FFT
	fftmxp = matrix_create2(mop, "FFT storage", i, fft_len);
	rpp = MXRPP(fftmxp), i = 0;
	mfccp->xrealp = rpp[i++];   // real part of the signal FFT
	mfccp->ximagp = rpp[i++];   // imaginary part of the signal FFT
	mfccp->cosp   = rpp[i++];   // lookup table for cos(x)
	mfccp->sinp   = rpp[i++];   // lookup table for sin(x)
	mfccp->c2sp   = rpp[i++];   // lookup table for cos(x/2)
	mfccp->s2np   = rpp[i++];   // lookup table for sin(x/2)
	mfccp->xrp    = rpp[i++];   // temporary real part of FFT
	mfccp->xip    = rpp[i++];   // temporary imaginary part of FFT
	mfccp->psp    = rpp[i++];   // power spectrum of the FFT
	mfccp->dctp   = rpp[i++];   // temporary storage for DCT 2 results
	                            // fft_len >> length of each filter in 
								// the filter bank, so no worries
	mfccp->realsigp = rpp[i++]; // input real signal, used when only debug
	fft_gen_cstable( fft_len>>1 ,     mfccp->cosp, mfccp->sinp);
	fft_gen_cstable_half(fft_len>>1,  mfccp->c2sp, mfccp->s2np);
	
	if ( 0 ) {
		mfccp->feat_len = feat_len = num_filters * num_filters;
	} else {
		mfccp->feat_len = feat_len = num_ceps * num_ceps;
	}
	mfccp->featmxp = matrix_create2(mop, "features per signal",  2, num_filters);
	//mfccp->ccmxp   = matrix_create2(mop, "coefficients", 2, feat_len);
	mfccp->valmxp  = matrix_create2(mop, "validation coefficients", 2, feat_len);

	return mfccp;

}

/*
 *  Release what the MFCC context owns: every filter of the bank, the filter
 *  table itself, the SVD work array, the audio sample buffer, and finally
 *  the matrix context (via matrix_op_fini()).
 *
 *  NOTE(review): the mfccp structure itself is not freed here -- confirm
 *  who is expected to release it.
 */
void mfcc_fini(struct mfcc_s  *mfccp)
{
	double **bank = mfccp->filterpp;
	unsigned count = mfccp->num_filters;
	unsigned idx;

	if ( bank != NULL ) {
		for ( idx = 0; idx < count; idx++ )
			free(bank[idx]);
		free(bank);
	}

	free(mfccp->workp);
	free(mfccp->audio_dp);

	matrix_op_fini(mfccp->mop);
}

/*
 *  One-time preparation that does not depend on any audio file:
 *  the filter bank first, then the framing parameters.
 */
void mfcc_ops(struct mfcc_s  *mfccp)
{
	mfcc_op_fbank(mfccp);   // fills filterpp[] and the edge tables
	mfcc_op_frame(mfccp);   // sets frame_len and frame_inc
}

/** ----- For physionet challenge 2016 only, extract the feature from the given WAV file ---- **/
/*
 *  Reads wavfilep, computes its MFCC matrix and stores the resulting feature
 *  vector in the single row of a freshly created "coefficients" matrix
 *  (feat_len+1 columns).  That matrix is also remembered in mfccp->ccmxp
 *  and returned.
 *
 *  The label is what gets predicted, so mfccp->audio_label is not touched;
 *  computed labels are subject to third-party evaluation.
 */
struct matrix_dsc_s *nn_feat_extract(struct mfcc_s *mfccp, char *wavfilep)
{
	const unsigned int one_row = 1;   // only one WAV file is processed
	unsigned int frames, pad, padded_len;
	struct matrix_dsc_s *coeffs;
	double **coeff_rows;

	mfccp->audio_fname = wavfilep;
	mfcc_read_data(mfccp);       // read the WAV file

	coeffs = matrix_create("coefficients", one_row, mfccp->feat_len+1);
	mfccp->ccmxp = coeffs;
	coeff_rows = MXRPP(coeffs);

	mfcc_framing_params(mfccp->num_dps, mfccp->frame_len, mfccp->frame_inc,
		&frames, &pad, &padded_len);

	// one row per frame, one column per filter
	matrix_resize(mfccp->featmxp, frames, mfccp->num_filters);

	mfccp->num_frames     = frames;
	mfccp->num_padding    = pad;
	mfccp->num_padded_dps = padded_len;
	mfccp->featp          = coeff_rows[0];

	mfcc_coefficients(mfccp);
	mfcc_features(mfccp);

	return coeffs;
}

/*
 *   Feature extraction from the audio files.
 *
 *   The numbers of training and validation files are recorded in
 *   mfccp->num_trainsets and mfccp->num_valsets (line counts of the
 *   REFERENCE.csv table of contents of each directory).
 *
 *   All the data files, training and validation, are read in,
 *   transformed and stored into a single matrix (mfccp->ccmxp).  The
 *   first rows are for training and the remaining rows are for
 *   validation.  Each row holds feat_len feature values followed by
 *   the label of the file.
 *
 *   TOC lines that are not of the form "<name>,<label>" are reported and
 *   skipped; processing of a TOC stops once the matrix is full.
 */
void mfcc_ops3(struct mfcc_s  *mfccp)
{
	unsigned int i, n, nn, k, num_files, total_num_files, num_frames, 
	    num_padding, num_padded_samples, feat_len, num_filters;
	int lbl;
	char root[] = "/home/mark/Downloads/cs/ml/mit/contests/2016",
	    *dirs[] = { "training-a",  "training-b",  "training-c" , 
		            "training-d",  "training-e",  "validation" },
		  toc[] = "REFERENCE.csv", 
	     fname[512], ftoc[512], buf[32], buf2[16], *p;
	struct matrix_dsc_s *featmxp, *ccmxp;
	double **rpp;
	FILE *fp;
	
	featmxp = mfccp->featmxp;

	// Count the training files: every directory except the last one.
	n = ARRAYSIZE(dirs);
	total_num_files = 0;
	for ( i = 0; i < n-1; i++ ) {
		snprintf(ftoc, sizeof(ftoc), "%s/%s/%s", root, dirs[i], toc); 
		num_files = nau_num_lines(ftoc);
		total_num_files += num_files;
		sprintf(dbg, "%s : %4u", dirs[i], num_files); DBG(dbg); 
	}
	
	sprintf(dbg, "Total # files(training)  = %4u", 
		total_num_files); DBG(dbg); 
	mfccp->num_trainsets = total_num_files;  // no. of training sets
	
	// i now indexes the last directory, the validation set.
	snprintf(ftoc, sizeof(ftoc), "%s/%s/%s", root, dirs[i], toc); 
	total_num_files = nau_num_lines(ftoc);
	
	sprintf(dbg, "Total # files(validation) = %4u",
		total_num_files); DBG(dbg); 
	mfccp->num_valsets = total_num_files;    // no. of validation sets
	num_filters = mfccp->num_filters;

	feat_len = mfccp->feat_len;
	nn = mfccp->num_trainsets + mfccp->num_valsets;
	
	mfccp->ccmxp = ccmxp = 
				matrix_create2(MOP(mfccp), "coefficients", nn, feat_len+1);
	k = 0,  rpp = MXRPP(ccmxp);  
	
	//
	// rpp is the pointer array of coefficient matrix ccmxp.
	// k is the index into rpp;  each rp in rpp holds one feature
	// extracted from each audio signal file.
	//

	// Loop through all the directories, training first, validation last
	for ( i = 0; i < n; i++ ) {
		snprintf(ftoc, sizeof(ftoc), "%s/%s/%s", root, dirs[i], toc); 
		fp = fopen(ftoc, "r");
		
		if ( !fp ) {
			// report the TOC that failed, not the last audio file name
			sprintf(dbg, "error to open TOC file \"%s.\". Please check.",
				ftoc); DBG(dbg); 
			continue;
		}

		// Read each audio file from the TOC in that directory.
		while ( fgets(buf, sizeof(buf), fp) ) {
			p = strchr(buf, ',');
			if ( !p ) {
				sprintf(dbg, "malformed line skipped in %s", ftoc); DBG(dbg); 
				continue;
			}
			*p = ' ';
			// %15s keeps the name inside buf2[16]
			if ( sscanf(buf, "%15s %d", buf2, &lbl) != 2 ) {
				sprintf(dbg, "malformed line skipped in %s", ftoc); DBG(dbg); 
				continue;
			}
			if ( k >= nn ) {
				sprintf(dbg, "more records than the %u counted; rest of %s ignored",
					nn, ftoc); DBG(dbg); 
				break;
			}

			// Construct the full file spec of the audio signal.
			snprintf(fname, sizeof(fname), "%s/%s/%s.wav", root, dirs[i], buf2); 
			// Pass along the params and call the routine 
			// to process this signal.
			mfccp->audio_fname = fname;  // the WAV file to be processed
			mfccp->audio_label = lbl;    // the label for this WAV file
			mfcc_read_data(mfccp);   // read the audio file into mfccp->audio_dp
			mfcc_framing_params(mfccp->num_dps, mfccp->frame_len,
				mfccp->frame_inc, &num_frames, &num_padding, &num_padded_samples);
			sprintf(dbg, "%s: %4u %4u %4u %4u", fname, mfccp->num_dps, 
				num_frames, num_padding, num_padded_samples); DBG(dbg);

			mfccp->num_frames = num_frames;
			mfccp->num_padding = num_padding;
			mfccp->num_padded_dps = num_padded_samples;

			// resize the feature matrix for the filter bank
			matrix_resize(featmxp, num_frames, num_filters);
			mfccp->featp = rpp[k++];
			mfcc_coefficients(mfccp);
			mfcc_features(mfccp);
			mfccp->featp[feat_len] = lbl;   // label goes in the extra last column
		}

		fclose(fp);
	}

	/*
	if ( 0 )
		matrix_write_file("ccmxp", ccmxp);
	else
		matrix_write_file("ccmxp2", ccmxp);
	*/
}



/*
 *  Given the signal length,  frame length and increment in numbers of samples,
 *  compute the total number of frames and the possible number of padding.
 *
 *  in:
 *    num_samples:  samples in the signal
 *    frame_len:    samples per frame
 *    frame_inc:    samples between the starts of successive frames
 *  out:
 *    no_frames:         frames needed to cover the whole signal
 *    no_padding:        samples to append so the last frame is complete
 *    no_padded_signal:  num_samples + padding
 *
 *  A signal no longer than one frame yields a single frame padded up to
 *  frame_len.  A zero increment cannot advance, so it also yields a single
 *  frame and no padding.
 */
void  mfcc_framing_params(unsigned int num_samples, unsigned int frame_len, unsigned int frame_inc, unsigned int *no_frames, unsigned int *no_padding, unsigned int *no_padded_signal)
{
	unsigned int span, num_frames, siglen;

	if ( num_samples <= frame_len ) {
		// avoids the unsigned wrap-around of num_samples - frame_len
		num_frames = 1;
		siglen = frame_len;
	} else if ( frame_inc == 0 ) {
		// avoids dividing by zero
		num_frames = 1;
		siglen = num_samples;
	} else {
		span = num_samples - frame_len;
		num_frames = span / frame_inc + 1;
		if ( span % frame_inc )
			num_frames++;          // a partial frame at the end
		
		// siglen is the total length in no. of samples
		siglen = (num_frames - 1) * frame_inc + frame_len;  
	}

	*no_frames  = num_frames;
	*no_padding = siglen - num_samples;
	*no_padded_signal = siglen;
}

/** ================== Construct the filter bank ================ **/
void mfcc_op_fbank(struct mfcc_s  *mfccp)
{
	unsigned int i, n, n1;
	unsigned int a, b, c;
	double *freqp, *melp, *dftbinp, hi, samplerate, fft_len, *filterp,
		    mel, meldelta, freq_lo, freq_hi, mel_lo, mel_hi;

	freqp   = mfccp->freqp,      melp  = mfccp->melp;  
	freq_lo = mfccp->freq_lo,   mel_lo = freq2mel(freq_lo);
	freq_hi = mfccp->freq_hi,   mel_hi = freq2mel(freq_hi);

	n = mfccp->num_filters + 1;
	n1 = n+1;

	mel = mel_lo;
	meldelta = (mel_hi - mel_lo) / (double)(n);

	for ( i = 0; i < n1; i++ ) {
		melp[i] = mel;
		freqp[i] = mel2freq(mel);
		mel += meldelta;
	}
	
	samplerate = mfccp->sample_rate, fft_len = mfccp->fft_len;
	dftbinp = mfccp->dftbinp;
	for ( i = 0; i < n1; i++  ) {
		hi = freqp[i];
		dftbinp[i] = floor( (fft_len + 1.0) * hi / samplerate );
	}

	
	n = mfccp->num_filters;
	for ( i = 0; i < n; i++ ) {
		a = (unsigned int)( dftbinp[i+0]);
		b = (unsigned int)( dftbinp[i+1]);
		c = (unsigned int)( dftbinp[i+2]);
		
		mfccp->filterpp[i] = mfcc_one_filter(a, b, c);
		if ( !(mfccp->filterpp[i]) ) {
			sprintf(dbg, "Error to allocate memory for filter %d ...(%d, %d)", 
			i, a, c); DBG(dbg); 
			break;
		}
	}

	if ( 0 ) {
		sprintf(dbg, "no. filters = %d",  mfccp->num_filters); DBG(dbg);
		sprintf(dbg, "freq = %8.3g  mel = %8.3g", freq_lo, mel_lo); DBG(dbg);
		sprintf(dbg, "freq = %8.5g  mel = %8.5g", freq_hi, mel_hi); DBG(dbg);
		vector_sprint("freq", n1, freqp);
		vector_sprint("mel ", n1, melp);
		vector_sprint("fft bin #", n1, dftbinp);
		mfcc_print_filters(mfccp);
	}

}

/** --------------- Compute the framing parameters -------------------
 *
 *  Converts the two frame intervals, given in seconds, into numbers of
 *  sample points and stores them in the context:
 *
 *    frame_len:  size of the analysis frame
 *    frame_inc:  distance between the starts of successive frames
 *
 *  The conversion factor is the constant 1000.  Earlier, now disabled,
 *  code multiplied by freq_hi instead.
 *
 **/
void  mfcc_op_frame(struct mfcc_s  *mfccp)
{
	const double len_sec = mfccp->frame_interval_len;
	const double inc_sec = mfccp->frame_interval_inc;

	mfccp->frame_len = (unsigned int)( len_sec * 1000);
	mfccp->frame_inc = (unsigned int)( inc_sec * 1000);
}

/*
 *  Debug dump of the filter bank: for every filter, its three DFT bin edges
 *  followed by one line per weight, all through DBG().
 */
void  mfcc_print_filters(struct mfcc_s *mfccp)
{
	unsigned int flt, bin, first, peak, last;
	const unsigned int count = mfccp->num_filters;
	double **bank = mfccp->filterpp;
	double *edges = mfccp->dftbinp, *weights;

	sprintf(dbg, " no. filters = %d", count); DBG(dbg); 

	for ( flt = 0; flt < count; flt++ ) {
		first = (unsigned int)( edges[flt] );
		peak  = (unsigned int)( edges[flt+1] );
		last  = (unsigned int)( edges[flt+2] );

		sprintf(dbg, "a, b, c = %3d, %3d, %3d", first, peak, last ); DBG(dbg); 

		// weights[0] belongs to bin `first`
		weights = bank[flt];
		for ( bin = first; bin < last; bin++ ) {
			sprintf(dbg, "%3d : %g", bin, weights[bin - first]); DBG(dbg); 
		}
	}
}


/*
 *  Compute the MFCC coefficients for the signal in mfccp->audio_dp.
 *
 *  For every frame: take the spectrum (mfcc_fft_psp), weigh it with each
 *  filter of the bank, store the log of each filter output, then replace
 *  the row by its type-2 DCT.  Row i of mfccp->featmxp receives frame i.
 *
 *  On entry num_frames and num_padded_dps are already set; when the padded
 *  length differs from num_dps the sample buffer is resized and the new
 *  tail is zero-filled.  If that resize fails the routine logs and returns
 *  without computing anything.
 */
void  mfcc_coefficients(struct mfcc_s *mfccp)
{
	unsigned int frame, flt, pos, lo, mid, hi;
	const unsigned int nfilters = mfccp->num_filters;
	const unsigned int nframes  = mfccp->num_frames;
	const unsigned int hop      = mfccp->frame_inc;
	const unsigned int fftsize  = mfccp->fft_len;
	const unsigned int have     = mfccp->num_dps;
	const unsigned int want     = mfccp->num_padded_dps;
	double **bank     = mfccp->filterpp;   // weighting functions for the DFT
	double *edges     = mfccp->dftbinp;    // DFT bins delimiting each filter
	double *scratch   = mfccp->dctp;
	double *samples   = mfccp->audio_dp;
	double *spectrum, *window, *out, **feat_rows, energy;

	if ( have != want ) {
		samples = realloc(samples, want * sizeof(double));
		if ( samples == NULL ) {
			sprintf(dbg, "realloc() error = %d", errno); DBG(dbg); 
			return;
		}
		for ( pos = have; pos < want; pos++ )
			samples[pos] = 0.0;   // zero-padded
		mfccp->audio_dp = samples;
	}

	feat_rows = MXRPP(mfccp->featmxp);
	spectrum  = mfccp->psp;

	if ( 0 ) {
		sprintf(dbg, "fft_len = %4d num_(frames, filters) = (%d, %d) ", 
		fftsize, nframes, nfilters); DBG(dbg); 
	}

	window = samples;
	for ( frame = 0; frame < nframes; frame++, window += hop ) {
		out = feat_rows[frame];
		mfcc_fft_psp(fftsize, window, mfccp->cosp, mfccp->sinp,
			mfccp->c2sp, mfccp->s2np, mfccp->xrealp, mfccp->ximagp,
			mfccp->xrp, mfccp->xip, spectrum);

		for ( flt = 0; flt < nfilters; flt++ ) {
			lo  = (unsigned int)( edges[flt] );
			mid = (unsigned int)( edges[flt+1] );
			hi  = (unsigned int)( edges[flt+2] );

			if ( 0 ) {
				sprintf(dbg, "a, b, c = %d, %d, %d",  lo, mid, hi);
				DBG(dbg); 
			}

			// the filter spans bins [lo, hi) of the spectrum
			energy = vectors_inner_prod(hi - lo, spectrum + lo, bank[flt]);

			// log(0) is replaced by log(MFCC_EPS)
			out[flt] = ( energy == 0.0 ) ? log(MFCC_EPS) : log(energy);
		}

		// DCT (flag 1), type 2; the result is copied back over the row
		dstdct(nfilters, out, scratch, 1, 2);
		memcpy(out, scratch, nfilters * sizeof(double));
	}
}

/*
 *  Given the computed MFCC matrix (mfccp->featmxp), extract the features
 *  from this matrix.  This is the feature vector for one signal.
 *
 *  The matrix is resized to num_ceps columns when that differs from
 *  num_filters, flattened with matrix2vector(), and passed to the LAPACK
 *  routine DGESVD with JOBU = 'A' and JOBVT = 'N'.  The left singular
 *  vectors (U) are written directly into mfccp->featp and the singular
 *  values into mfccp->svalp; mfccp->workp is the work array.
 *
 *  NOTE(review): earlier wording called these "eigenvectors"; DGESVD
 *  returns singular vectors.
 *
 *  Failures (flattening or DGESVD) are only logged; the caller is not told.
 */
void mfcc_features(struct mfcc_s *mfccp)
{
	char ju, jvt;
	unsigned int num_ceps;
	int m, n, mn, ns, dims, lda, ldu, ldvt, lwork, ok, na;
	double *Ap, *up, *sp, *vTp, *work;
	struct matrix_dsc_s *Amxp;

	Amxp = mfccp->featmxp;
	num_ceps = mfccp->num_ceps;

	// keep only the first num_ceps columns -- presumably what matrix_resize()
	// does when shrinking; TODO confirm
	if ( num_ceps != mfccp->num_filters ) {
		matrix_resize(Amxp, MXROWS(Amxp), num_ceps);
	}

	// This is the transpose of the feature matrix. 
	// (m is the column count and n the row count of Amxp; presumably the
	// row-wise flattening below is what FORTRAN then sees as an m x n
	// column-major array -- TODO confirm against matrix2vector())
	m = MXCOLS(Amxp),  n = MXROWS(Amxp);

	ju = 'A',   jvt = 'N';      // Compute full U but no VT matrix.
	mn = m * n,  ns = min(m, n);   lda = m;
	up = mfccp->featp;             ldu = m;
    sp = mfccp->svalp;             lwork = 50 * min(m, n); //
	vTp = NULL;     ldvt = n;      work  = mfccp->workp;
	                               //work = MALLOC(lwork, double);
	// Ap is a temporary copy released at the end of this function.
	Ap = matrix2vector(Amxp, &na, MATVEC_XTYPE_ROW);
	if ( !Ap ) {
		sprintf(dbg, "realloc() error:  %d", errno); DBG(dbg); 
		return;
	}


	// LAPACK FORTRAN subroutine DGESVD to compute 
	// the U matrix (left singular vectors)
	//
	// All scalar arguments are passed by address (FORTRAN calling
	// convention); `ok` receives the INFO status.
    dgesvd_(&ju, &jvt,  &m,   &n,  Ap, &lda, sp, up, &ldu,
	        vTp, &ldvt, work, &lwork,  &ok);

	if ( ok < 0 ) {
		sprintf(dbg, "Please check argument %d, which is illegal.", 
			abs(ok) ); DBG(dbg); 
	} else if ( ok > 0 ) {
		DBG("Unfortunately, the subroutine DBDSQR() , which DGESVD calls, did not converge.");
	} else {
	    // Everything is ok, here are the results.
		//
		// Here the orthonormal (column) vectors may be subject to
		// review for debugging purposes.  By design, the caller
		// points mfccp->featp at the next row of the coefficient
		// matrix before each call.  This means the vectors are
		// stored in the coefficient matrix directly by the called
		// FORTRAN routine. 
		//
	}

	free(Ap);  
}


/*
 *   Based on the "band" of the filter, create the "triangular" 
 *   filter (in the frequency domain) ( to reduce the leakage,
 *   the side-lobe).
 *
 *   a, b, c are the first, peak and end DFT bins of the filter.  The
 *   returned array has c - a weights, weight 0 belonging to bin a: they
 *   rise linearly from 0 at a towards 1 at b, then fall from 1 at b
 *   towards 0 at c.
 *
 *   Returns NULL when the allocation fails.  The array comes from MALLOC()
 *   and belongs to the caller.
 */

double *mfcc_one_filter(unsigned int a, unsigned int b, unsigned int c)
{
	unsigned int bin;
	double *weights, rise, fall;

	weights = MALLOC(c - a, double);
	if ( weights == NULL )
		return weights;

	rise = (double)(b - a);   // width of the rising slope
	fall = (double)(c - b);   // width of the falling slope

	for ( bin = a; bin < b; bin++ )
		weights[bin - a] = (double)(bin - a) / rise;

	for ( bin = b; bin < c; bin++ )
		weights[bin - a] = (double)(c - bin) / fall;

	return weights;
}


/*
 *   Generate the GNUplot script to plot the visual of the generated
 *   filter bank.   If correct, we can see the "regular" triangular
 *   filters (weighting functions).
 *
 *   The script is written to the file "x1" in the current directory.
 *   On system with GNUplot installed, type in command 
 *
 *      gnuplot x1
 *  
 *   will give the visual. 
 *
 *   If "x1" cannot be created the failure is logged and nothing is written.
 */
void mfcc_op_plot(struct mfcc_s *mfccp)
{
	unsigned int i, n;
	double *freqp, midf, f1, f2, 
	        freq_lo, freq_hi, mel_lo, mel_hi;
	FILE *fp;

	  freqp = mfccp->freqp;
	freq_lo = mfccp->freq_lo,      mel_lo = freq2mel(freq_lo);
	freq_hi = mfccp->freq_hi,      mel_hi = freq2mel(freq_hi);
	
	sprintf(dbg, " freq(lo, hi)= (%g, %g)", freq_lo, freq_hi); DBG(dbg); 
	sprintf(dbg, " mel(lo, hi)= (%g, %g)", mel_lo, mel_hi); DBG(dbg); 
	
	fp = fopen("x1", "w");
	if ( !fp ) {
		sprintf(dbg, "error to open %s", "x1"); DBG(dbg); 
		return;
	}
	n = mfccp->num_filters;
	sprintf(dbg, "# filters = %d", n); DBG(dbg); 
	
	fprintf(fp, "set xrange [%g:%g]\n", freq_lo, freq_hi);
	fprintf(fp, "set yrange [0:1]\n");

	// One triangle per filter: two arrows from the peak (height 1) down to
	// the neighbouring edge frequencies (height 0).
	for ( i = 1; i <= n; i++ ) {
		midf = freqp[i];
		f1 = freqp[i-1], f2 = freqp[i+1];
		fprintf(fp, "set arrow from %g, 1 to %g, 0 nohead\n", midf, f1);
		fprintf(fp, "set arrow from %g, 1 to %g, 0 nohead\n", midf, f2);
		
	}
	
	fprintf(fp, "plot 0\npause -1\n");
	
	// Close (and thereby flush) the script before anything could run it.
	if ( fclose(fp) != 0 ) {
		sprintf(dbg, "error to write %s", "x1"); DBG(dbg); 
		return;
	}

	if ( 0 ) {
		// On Linux/Unix, execute the shell command 
		// provided GNUplot is installed.
		system("gnuplot x1");  
	}
}

/*
 *  The following FFT code, functions of ibitr() and fft_stalone(), are
 *  adapted from the FORTRAN program (pp 164) in the book 
 *
 *       The Fast Fourier Transform 
 *          by E. Oran Brigham, 1974.
 *
 *  ibitr() reverses the lowest nu bits of integer j and returns them as
 *  the value.  nu is the number of bits to reverse; bits of j above the
 *  lowest nu are ignored.
 *
 *  Example:  ibitr(6, 3) == 3   (binary 110 -> 011)
 */
unsigned int ibitr(unsigned int j, unsigned int nu)
{
	unsigned int reversed = 0;
	unsigned int remaining = j;
	unsigned int left;

	for ( left = nu; left > 0; left-- ) {
		// move the lowest remaining bit onto the low end of the result
		reversed = (reversed << 1) | (remaining & 1u);
		remaining >>= 1;
	}

	return reversed;
}


void freq_spectrum(unsigned int N, double *xrealp, double *ximagp, double *psp)
{
	unsigned int i, n;
	
	n = N;
	for ( i = 0; i < n; i++ ) {
		psp[i] = hypot(xrealp[i], ximagp[i]);
	}
}


/**---------------- Standalone  FFT base-2 implementation ------------------ **/

void  fft_gen_cstable(unsigned int N, double *cp, double *sp)
{
	unsigned int i;
	double arg, theta;

	arg = 2.0 * M_PI/(double)N;
	for (i = 0; i < N; i++) { // compute the lookup table for  cos() and sin()
		theta = (double)(i) * arg;
		cp[i] = cos(theta);
		sp[i] = sin(theta);
	}
}

void  fft_gen_cstable_half(unsigned int N, double *cp, double *sp)
{
	unsigned int i;
	double arg, theta;

	arg = M_PI/(double)N;
	for (i = 0; i < N; i++) { // compute the lookup table for  cos() and sin()
		theta = (double)(i) * arg;
		cp[i] = cos(theta);
		sp[i] = sin(theta);
	}
}

/*
 *  In-place radix-2 FFT of N complex points.
 *
 *    N       number of points
 *    cp, sp  cos/sin lookup tables, indexed by the bit-reversed value p
 *            computed inside the butterfly loop
 *    xrealp  real parts, overwritten with the result
 *    ximagp  imaginary parts, overwritten with the result
 *
 *  nu passes of butterflies are followed by a bit-reversal reordering.
 *  NOTE(review): nu = _ilog(N-1) is presumably log2(N) for N a power of
 *  two -- confirm against the definition of _ilog().
 */
void  fft_stalone(unsigned int N, double *cp, double *sp, double *xrealp, double *ximagp)
{
	unsigned int i, j, k, l, n, n2, kn2, nu, nu1, p, br;
	double treal, timag, c, s, t, theta;

	n2 = N >> 1,  k = 0;     // n2: distance between the two ends of a butterfly
	nu  = _ilog(N-1);
	nu1 = nu - 1;

	//sprintf(dbg, "n2 = %d, nu = %d k=%d", n2, nu, k); DBG(dbg); 
	
	// One pass per bit; after each pass n2 is halved and nu1 decremented.
	for ( l = 0; l < nu; l++ ) {
		again2:
		// Process one group of n2 butterflies pairing index k with k + n2.
		for ( j = 0; j < n2; j++ ) {
			p = ibitr( k/(1<<nu1), nu);   // table index of the twiddle factor
			c = cp[p],	s = sp[p];	
			kn2 = k + n2;
			// (treal, timag) = x[kn2] * (c - i*s)
			treal = xrealp[kn2] * c + ximagp[kn2] * s;
			timag = ximagp[kn2] * c - xrealp[kn2] * s;
			xrealp[kn2] = xrealp[k] - treal;
			ximagp[kn2] = ximagp[k] - timag;
			xrealp[k]  += treal;
			ximagp[k]  += timag;
			k++;
		}
		
		// skip the lower halves just written and continue with the next group
		k += n2;
		
		if ( k < N ) 
			goto again2;

		k = 0;
		nu1--;
		n2 >>= 1;
	}
	
	// --- unscrambled the results ---
	// Swap every element with its bit-reversed partner; br > k makes sure
	// each pair is exchanged only once.
	for ( k = 0; k < N; k++ ) {
		br = ibitr(k, nu);
		if ( br > k ) {
			treal = xrealp[k];
			timag = ximagp[k];
			xrealp[k] = xrealp[br];
			ximagp[k] = ximagp[br];
			xrealp[br] = treal;
			ximagp[br] = timag;
		}
	}

}


/*
 *   For MFCC FFT only.  
 *   
 *   All the storage pointed to by a double pointer in this routine is pre-
 *   allocated to avoid the overhead of dynamic memory allocation each time 
 *   when this routine is called.  The lookup tables for cos() and sin()
 *   are also pre-computed to avoid duplicate computations.
 *
 *   Input signal is real.  The N real samples are packed as N/2 complex
 *   points (even samples -> real part, odd samples -> imaginary part), a
 *   complex FFT of N/2 points is run on them, and the spectrum of the real
 *   signal is then rebuilt from that result.
 *
 *   n   is the number of data points to be FFTed
 *   realsigp is real data points to be FFTed
 *   cp  is pre-computed cos(x) table for lookup
 *   sp  is pre-computed sin(x) table for lookup
 *   c2p is pre-computed cos(x/2) table for lookup
 *   s2p is pre-computed sin(x/2) table for lookup
 *   xrealp is real part of FFT results 
 *   ximagp is imaginary part of FFT results 
 *   xrp is temporary storage for real part of N samples FFT
 *   xip is temporary storage for imaginary part of N samples FFT
 *   psp is the spectrum result of the FFT analysis.  It is filled by
 *       freq_spectrum(), i.e. with the magnitudes hypot(re, im).
 *
 *   NOTE(review): N = 1 << _ilog(n-1) is presumably n itself when n is a
 *   power of two -- confirm against the definition of _ilog().
 *
 */

void  mfcc_fft_psp(unsigned int n, double *realsigp, double *cp, double *sp, double *c2p,
double *s2p, double *xrealp, double *ximagp, double *xrp, double *xip, double *psp)
{
	unsigned int i, k, N, nbits;
	double c, s, rn, RN, in, IN, t;

	nbits = _ilog(n-1);   // log2() function
	N = 1 << nbits;
	k = N >> 1;           // compute only half of the samples
	
	for ( i = 0 ; i < k; i++ ) {   // k is only half of N.
		xrp[i] = realsigp[2*i  ];  // even-numbered points to (reduced) real part
		xip[i] = realsigp[2*i+1];  // odd-numbered  points to (reduced) imaginary part
		xrp[i+k] = 0.0;            // upper halves of the work arrays are cleared
		xip[i+k] = 0.0;
	}

	fft_stalone(k, cp, sp, xrp, xip);  // FFT

	// Rebuild the spectrum of the real signal from the packed FFT:
	// point i is combined with its mirror point k-i.
	for ( i = 0; i < k; i++ ) {
		rn = xrp[i],  RN = xrp[k-i], c = c2p[i]; 
		in = xip[i],  IN = xip[k-i], s = s2p[i]; 
		if ( i == 0 ) {
			// index k-0 lies outside the transform: point 0 mirrors itself
			RN = xrp[i],  IN = xip[0];
		}

		xrealp[i] = ( rn + RN + c * (in + IN) - s * (rn - RN) )/2.0;
		ximagp[i] = ( in - IN - s * (in + IN) - c * (rn - RN) )/2.0;
	}

	// the spectrum (magnitudes) for the first N/2 points
	freq_spectrum(k, xrealp, ximagp, psp);   
	
	// The FFT of a real signal is symmetric about N/2. Therefore,
	// usually we compute only N/2 points.  If the remaining N/2 
	// points are needed, just replicate it.
	// (entry i is copied unchanged to entry N-i-1)
	if ( 1 ) { 
		for ( i = 0; i < k; i++ ) {
			xrealp[N-i-1] = xrealp[i];
			ximagp[N-i-1] = ximagp[i];
			   psp[N-i-1] = psp[i];
		}
	}

}


/** ------------------- FFT: DST and DCT -------------------- **/

void  dstdct(unsigned int m, double *xp, double *yp, unsigned int dctflag, unsigned int type)
{
	unsigned int k, j;
	double c;

	if ( dctflag ) {
		switch( type ) {
		case 2:
			for ( k = 0; k < m; k++ ) {
				yp[k] = 0.0;
				for ( j = 0; j < m; j++ ) {
					c = (double)(k * (2*j+1)) / (double)(2*m);
					yp[k] += 2* cos(c * M_PI) *xp[j];
				}
			}
		break;

		case 3:
			for ( k = 0; k < m; k++ ) {
				yp[k] = 0.5 * xp[0] * 2;
				for ( j = 1; j < m; j++ ) {
					yp[k] += 2* cos( (2*k+1) * j * M_PI/(double)(2*m)) *xp[j];
				}
			}
		break;

		case 4:
			for ( k = 0; k < m; k++ ) {
				yp[k] = 0.0;
				for ( j = 0; j < m; j++ ) {
					yp[k] += cos( (2*k+1)* (2*j+1) * M_PI/(double)(2*2*m)) *xp[j];
				}
			}
		break;

		}
	
	} else {

		switch( type ) {
		case 2:
			for ( k = 1; k <= m; k++ ) {
				yp[k-1] = 0.0;
				for ( j = 1; j <= m; j++ ) {
					yp[k-1] += sin(k * (2*j-1) * M_PI/(double)(2*m)) *xp[j];
				}
			}
		break;
		
		case 3:
			for ( k = 1; k <= m; k++ ) {
				yp[k-1] = 0.0;
				for ( j = 0; j < m; j++ ) {
					yp[k-1] += sin((2*k-1)*j * M_PI/(double)(2*m)) *xp[j];
				}
			}
		break;

		case 4:
			for ( k = 1; k <= m; k++ ) {
				yp[k-1] = 0.0;
				for ( j = 0; j < m; j++ ) {
					yp[k-1] += sin( (2*k-1) * (2*j-1) * M_PI/(double)(2*m)) *xp[j];
				}
			}
		break;
		}
	}
}

/** ------------ FFT: DST and DCT end----------------- **/


/** ========================  NN Begin ====================== **/
// Compile-time selector of the output activation.  run_nn_pred() derives
// its decision threshold and its negative label from this value.
//#define  NN_USE_SIGMOID   1   // 1 to choose sigmoid() as activation function
#define    NN_USE_SIGMOID  0   // 0 to choose tanh() as activation function

/*
 *   Top-level descriptor of one neural network: configuration read from
 *   the config file, the per-layer structures, the matrix container and
 *   the training/validation data with their dimensions.
 */
struct nn_s {
	unsigned char *dscp;     // description string; strdup()'d by the init routines, free()'d by the fini routines
	unsigned int  
		val_flag,    // flag 1 indicates that there is validation data set to be processed
		pred_flag,   // with val_flag set, makes nn_prec() score the validation activations
		num_calls,   // The number of the nn cost function gets called.
		num_classes, // The number of classes if using classification
		num_dims, num_layers, prwidth,   // num_layers: set by nn_read_cfg(); prwidth: field width for the show/draw routines
	    vec_dim, num_lps,  num_labels,   // num_lps: num_layers - 1; num_labels: units in the last layer
		*dim_lp,  *unit_layer_lp,  // list of units in each layer
		                           //   (unit_layer_lp is allocated by nn_read_cfg(), dim_lp is nn_init()'s copy)
		batch_size, cur_batch_off, num_samples, alg_no;   // alg_no selects the data<N>_train / data<N>_val matrices
	double cost, lambda, initwt_ub, initwt_lb;   // initwt_*: random-init bounds set by nn_randwt_bounds()

	struct matrix_op_s  *mop;   // matrix container; released with matrix_op_fini()
	struct nn_layer_s  **lpp;   // array of per-layer structures
	struct nlnopt_s     *nlp;   // nonlinear optimization state (allocated in nn_init())

	struct matrix_dsc_s 
		*tmxp,                // target labels in NN internal format, built by nn_tlabels_raw2int() from ymxp
		*Xmxp, *ymxp,         // input training data and their labels
		*val_Xmxp, *val_ymxp, // validation data and their labels
		*rmxp;  // validation labels in NN internal format (nn_init() fills it from val_ymxp);
		        //   nn_ops_prec() passes it to nn_prec() as the known good labels
	unsigned int num_Xrows, num_Xcols, num_yrows, num_ycols,
		num_val_Xrows, num_val_Xcols, num_val_yrows, num_val_ycols;   // cached matrix dimensions, set by nn_physionet_data()
};



//------------------- NN functions -----------------
// Forward declarations of the NN routines.

// --- life cycle and top-level operations ---
struct nn_s *nn_init(struct na_params_s *p);
void   nn_fini(struct nn_s *npp);
void   nn_ops3(struct nn_s *nnp);
void   nn_dump_model(struct nn_s *nnp);
void   nn_physionet_data(struct nn_s *nnp);

// --- accessors of struct nn_s fields ---
unsigned int nn_num_calls(struct nn_s *nnp);
unsigned int nn_num_layers(struct nn_s *nnp);
unsigned int nn_vector_dim(struct nn_s *nnp);
unsigned int nn_batch_size(struct nn_s *nnp);
unsigned int nn_cur_batch_off(struct nn_s *nnp);
unsigned int nn_num_lps(struct nn_s *nnp);
unsigned int nn_num_samples(struct nn_s *nnp);

// --- network set-up, feed-forward / back-propagation, optimization ---
void   nn_vecmat_dim(struct nn_s *nnp);
void   nn_vecmat_map(struct nn_s *nnp);
void   nn_randwt_bounds(struct nn_s *nnp);
//unsigned int nn_vector_size(unsigned int nl, unsigned int *ap);
void   nn_ff_layers(struct nn_s *nnp);
void   nn_bp_layers(struct nn_s *nnp);
double nn_cost(struct nn_s *nnp);
double nn_funcval(struct nn_s *nnp,  double alpha, double *df);
void   nn_load_batchdata(struct nn_s *nnp);
void   nn_nlnopt2(struct nn_s *nnp);
double nn_nlnopt_init(struct nn_s *nnp, double *s);
double nn_nlnopt_nextiter(struct nlnopt_s *nlp);
void   nn_nlnopt_x2x1(struct nn_s *nnp);
void   nn_mx_conf(struct nn_s *nnp, unsigned int type);
void   nn_load_weights_biases(struct nn_s *nnp);
void   nn_init_weights_biases(struct nn_s *nnp);
void   nn_layers_write(struct nn_s *nnp);
unsigned int *nn_read_cfg(struct nn_s *nnp);
void   nn_bp_layer_lp(struct nn_layer_s *lp, struct nn_layer_s *lp2);
void   nn_bp_output(struct nn_s *nnp);
struct nn_layer_s *nn_last_lp(struct nn_s *nnp);
unsigned int nn_num_labels(struct nn_s *nnp);
void   nn_tlabels_raw2int(struct nn_s *nnp, struct matrix_dsc_s *std_mxp, struct matrix_dsc_s *raw_label_mxp);
unsigned int nn_is_multiclass(struct nn_s *nnp);
void   nn_val_layers(struct nn_s *nnp);
void   nn_ff_layer(struct nn_layer_s *lp);
void   nn_bp_dbmxp(struct nn_layer_s *lp);
void   nn_bp_weight_gradient(struct matrix_dsc_s *zmxp, struct matrix_dsc_s *dmxp, struct matrix_dsc_s *dwmxp);

// --- prediction precision ---
double nn_prec(struct nn_s *nnp, struct matrix_dsc_s *std_mxp);
void   nn_ops_prec(struct nn_s *nnp);

// --- line-search helpers of the nonlinear optimizer ---
double nlnopt_alpha(double f1, double f2, double f3, 
	                double s2, double s3, double z3);
double nlnopt_cubic_fit(double z3, double f2, double f3, double s2, double s3);
double nlnopt_quad_fit(double z3, double f2, double f3, double s3);

// --- gradient checking and debugging aids ---
void   nn_check_grad(struct nn_s *nnp);
void   nn_finite_num_diff(struct nn_s *nnp);
void   nn_grad_relative_diff(struct nn_s *nnp);
void   nn_dbg_init_weights(struct nn_s *nnp);
void   nn_dbg_init_mxp(struct matrix_dsc_s *mxp);
void   nn_dbg_init_Xy(struct nn_s *nnp);
void   nn_dbg_init_y(struct nn_s *nnp);
struct nn_layer_s **nn_get_lpp(struct nn_s *nnp);
void   nn_netinfo_show(struct nn_s *nnp);
void   nn_dbg_dump_model(struct nn_s *nnp);
struct matrix_dsc_s *nn_layer_mx_show(struct nn_s *nnp, unsigned int layer,  char *midp);


// --- regularization and the network layout printing routines ---
double nn_l2_reg(struct nn_s *nnp);
void   nn_nl_show(struct nn_s *nnp);
void   nn_mx_show(struct nn_s *nnp);
void   nn_draw_hmark(struct nn_s *nnp, char *hp);
void   nn_draw_dline(struct nn_s *nnp, unsigned type);


/*
 *   Driver of the NN set-up path: extract the MFCC features of one
 *   (hard-coded) recording, build the network with nn_init(), register
 *   the feature matrix with the network's matrix container, print the
 *   network layout and tear the network down again.
 */
void   run_nn(struct na_params_s *p)
{
	char sample_wav[] = "/home/mark/Downloads/cs/ml/mit/contests/2016/validation/a0001.wav";
	struct mfcc_s *mfp;
	struct matrix_dsc_s *feat_mxp;
	struct nn_s *net;

	// --- feature extraction ---
	mfp = mfcc_init(p);
	mfcc_ops(mfp);
	feat_mxp = nn_feat_extract(mfp, sample_wav);
	mfcc_fini(mfp);

	// --- network construction, report, teardown ---
	net = nn_init(p);
	matrix_add_mxp(MOP(net), feat_mxp);
	nn_netinfo_show(net);
	nn_fini(net);
}


/*
 *    Use the obtained NN model to predict.
 *
 *    nn_pred() and physionet_get_label() return the raw output value
 *    of the network (double); earlier variants returned an int label.
 *    The conversion into a label is done by run_nn_pred().
 */
struct nn_s *nn_pred_init(struct na_params_s *p);
//int    nn_pred(struct nn_s *nnp);
double    nn_pred(struct nn_s *nnp);
double    physionet_get_label(struct matrix_dsc_s *mxp);
//int    physionet_get_label(struct matrix_dsc_s *mxp);
void   nn_pred_fini(struct nn_s *nnp);

void run_nn_pred(struct na_params_s *p)
{
	int lbl;
	double v, thresh;
	struct nn_s *nnp;
	char ans[] = "answers.txt";
	FILE *fp;

	nnp = nn_pred_init(p);
	
	//lbl = nn_pred(nnp);
	v = nn_pred(nnp);
	thresh = NN_USE_SIGMOID == 1 ? 0.5 : 0.0;
	lbl = v > thresh ? 1 : ( NN_USE_SIGMOID == 1 ? 0.0 : -1 );

	nn_pred_fini(nnp);

	fp = fopen(ans, "a+");
	if ( fp ) {
		if ( 0 )
			fprintf(fp, "%s,%d %8.3g\n", p->dfname, lbl, v);
		else
			fprintf(fp, "%s,%d\n", p->dfname, lbl);

		fclose(fp);
	} else {
		sprintf(dbg, "Error to open file %s: errno = %d. Please check.",
		p->dfname, errno); DBG(dbg); 
	}

}

struct nn_s *nn_pred_init(struct na_params_s *p)
{
	unsigned int i, j, n, nr, nc, *ll, *ip;
	struct nn_s *nnp;
	struct nn_layer_s **lpp, *lp, *lp2;
	
	struct mfcc_s *mfccp;
	struct matrix_op_s *mop;
	struct matrix_dsc_s  *ccmxp, *data_mxp = p->mxp;
	char  wbuf[BUF_LEN], bbuf[BUF_LEN], zbuf[BUF_LEN], fbuf[BUF_LEN],
		//wf[] = "/home/mark/Downloads/cs/ml/mit/contests/2016/validation/a0007.wav",
		wf_dsc[] = "%s.wav", wf_dsc2[] = "validation/%s.wav";
	
	sprintf(fbuf, wf_dsc, p->dfname);
	if ( !nau_file_size(fbuf) )
		sprintf(fbuf, wf_dsc2, p->dfname);
		
	mfccp = mfcc_init(p);
	mfcc_ops(mfccp);
	ccmxp = nn_feat_extract(mfccp, fbuf);
	mfcc_fini(mfccp);
	
	// remove the last column for label
	matrix_resize(ccmxp, MXROWS(ccmxp), MXCOLS(ccmxp)-1); 
	data_mxp = ccmxp;

	nnp = CALLOC(1, struct nn_s);
	nnp->mop = mop = matrix_op_init2("ntuml");
	nnp->dscp = strdup("Physionet 2016 Challenge prediction");
	ll = nn_read_cfg(nnp);   // read in the config data for NN network
	
	n = nn_num_layers(nnp);
	nnp->num_lps = n - 1;

	nnp->lpp = lpp = MALLOC(n, struct nn_layer_s *);
	for ( i = 0; i < n; i++ ) {
		lpp[i] = lp = CALLOC(1, struct nn_layer_s);
		
		nr = ll[i],  nc = ll[i+1];
		
		if ( !i ) {
			// This is the input feature data whose label is to be predicted.
			lp->zmxp = data_mxp;  
		} else {
			sprintf(zbuf, "z%d", i); 
			lp->zmxp = matrix_create2(mop, zbuf, 1, nr) ;
		}


		// And load the model...
		if ( i < (n-1) ) {
			sprintf(wbuf, "w%d", i); 
			sprintf(bbuf, "b%d", i); 
			if ( 0 ) {
				lp->wmxp = matrix_create2(mop, wbuf, nr, nc);
				lp->bmxp = matrix_create2(mop, bbuf, 1,  nc);
				matrix_load_file(lp->wmxp, wbuf);
				matrix_load_file(lp->bmxp, bbuf);
			} else {
				lp->wmxp = mop2mxp_by_str(mop, wbuf);
				lp->bmxp = mop2mxp_by_str(mop, bbuf);
			}
		}
	
	}
	
	for ( i = 0; i < n-1; i++ ) {
		lp = lpp[i];
		lp2 = lpp[i+1];
		lp->amxp = lp2->zmxp;
	}


	return nnp;
}

/*
 *   Release a network built by nn_pred_init(): every layer structure,
 *   the matrix container, the layer array, the description string, the
 *   unit list and finally the nn_s itself.
 */
void nn_pred_fini(struct nn_s *nnp)
{
	unsigned int num_layers = nn_num_layers(nnp);
	struct nn_layer_s **layers = nn_get_lpp(nnp);
	unsigned int idx = 0;

	// one nn_layer_s was allocated per layer
	while ( idx < num_layers ) {
		free(layers[idx]);
		idx++;
	}

	matrix_op_fini(nnp->mop);
	free(nnp->lpp);
	free(nnp->dscp);
	free(nnp->unit_layer_lp);
	free(nnp);
}

//int  nn_pred(struct nn_s *nnp)
/*
 *   Run the feed-forward pass and return the raw output value taken
 *   from the activation matrix of the last layer structure.
 */
double  nn_pred(struct nn_s *nnp)
{
	struct nn_layer_s *out_lp;

	nn_ff_layers(nnp);
	out_lp = nn_last_lp(nnp);

	return physionet_get_label(out_lp->amxp);
}


/*
 *   Return the computed output of the network, i.e. element (0, 0)
 *   of mxp, as a raw value.
 *
 *   No thresholding is done here: turning the value into a label
 *   (1, or -1 as required by physionet) is left to the caller.  There
 *   are no "uncertain" labels (0).  Any prediction errors are supposed
 *   to be corrected via further NN training and/or the improvement of
 *   the algorithm of feature extraction.
 */
double  physionet_get_label(struct matrix_dsc_s *mxp)
{
	double **rows = MXRPP(mxp);

	return rows[0][0];
}

/*
 *   Read the configuration file "nncfg" into the nn_s{}: 
 *    # layers and the # units in each layer 
 *
 *   The first line that is not empty and does not start with '#' or
 *   '!' is the configuration line.  It is parsed with ip_per_row();
 *   element 0 of the parsed array is taken as the number of layers and
 *   the following elements as the number of units of each layer.
 *
 *   On success nnp->num_layers and nnp->unit_layer_lp are set and the
 *   unit list is returned; it is owned by nnp and released by the fini
 *   routines.  NULL is returned when the file cannot be opened, has no
 *   configuration line, cannot be parsed, or memory runs out.
 */
unsigned int *nn_read_cfg(struct nn_s *nnp)
{
	char c, buf[1024], x[] = "nncfg"; 
	unsigned int i, n, *tp, *ip;
	int found = 0;
	FILE *fp;

	fp = fopen(x, "r");
	if (!fp ) {
		sprintf(dbg, "Error to open file \"%s\"", x); DBG(dbg); 
		return NULL;
	}

	while ( fgets(buf, sizeof(buf), fp) ) {
		c = buf[0];
		if ( c && c != '\n' && c != '#' && c != '!' ) {
			found = 1;
			break;
		}
	}

	fclose(fp);

	if ( !found ) {
		// buf holds a comment line or nothing at all: do not parse it.
		sprintf(dbg, "No configuration line found in file \"%s\"", x); DBG(dbg); 
		return NULL;
	}

	tp = ip_per_row(buf);
	if ( !tp ) {
		sprintf(dbg, "Error to parse the configuration line in file \"%s\"", x); DBG(dbg); 
		return NULL;
	}
	n = tp[0];

	nnp->num_layers = n;
	ip = nnp->unit_layer_lp =  MALLOC(n, unsigned int); 
	if ( ip ) {
		for ( i = 0; i < n; i++ ) {
			ip[i] = tp[i+1];
		}
	} else {
		sprintf(dbg, "error to allocate memory: errno = %d", errno); DBG(dbg); 
	} 

	free(tp);

	return ip;
}

/*
 *   Print a summary of the network through DBG(): batch size, number
 *   of layers, the sizes of the training data and label matrices and
 *   the vector dimension, followed by the per-layer unit table and the
 *   per-layer matrix size table.
 */
void   nn_netinfo_show(struct nn_s *nnp)
{
	char *label[] = {
		" SGD batch size", 
		" # of layers",
		" X (training sample) size",
		" y (training label)  size",
		" vector dims (in optimiz.)",
		"                         ",
		"                         ",
		"                         ",
		"                         " 
	};
	unsigned int pad = 46;    // width of the left-aligned caption column
	unsigned int dim;
	struct matrix_dsc_s *train_x = nnp->Xmxp;
	struct matrix_dsc_s *train_y = nnp->ymxp;

	sprintf(dbg, "%-*s : %5d", pad, label[0], nn_batch_size(nnp) );
	DBG(dbg); 

	sprintf(dbg, "%-*s : %5d", pad, label[1], nn_num_layers(nnp) );
	DBG(dbg); 

	sprintf(dbg, "%-*s : %5d x %-5d", pad, label[2], MXROWS(train_x), MXCOLS(train_x) );
	DBG(dbg); 

	sprintf(dbg, "%-*s : %5d x %-5d", pad, label[3], MXROWS(train_y), MXCOLS(train_y) );
	DBG(dbg); 

	dim = nn_vector_dim(nnp);
	sprintf(dbg, "%-*s : %5d", pad, label[4], dim );
	DBG(dbg); 

	nn_nl_show(nnp);
	nn_mx_show(nnp);
}

/*
 *   Print, through DBG(), a small table with one column per layer:
 *   a top rule, the layer numbers, the number of units of each layer
 *   and a bottom rule.  Each column is nnp->prwidth characters wide.
 */
void nn_nl_show(struct nn_s *nnp)
{
	char cell[BUF_LEN];
	unsigned int layer, fill, num_layers, colw;

	num_layers = nn_num_layers(nnp);
	colw = nnp->prwidth;

	// top rule: dots between the "input" and "output" marks
	strcpy(dbg, "---------+-input-");
	for ( fill = colw *(num_layers-1); fill != 0; fill-- )
		strcat(dbg, ".");
	strcat(dbg, "output---");
	DBG(dbg);

	// row of layer numbers
	strcpy(dbg, "  layer  |");
	layer = 0;
	while ( layer < num_layers ) {
		sprintf(cell, "%*d", colw, layer);
		strcat(dbg, cell);
		layer++;
	}
	DBG(dbg);

	// row of unit counts
	strcpy(dbg, " # units |");
	layer = 0;
	while ( layer < num_layers ) {
		sprintf(cell, "%*d", colw, nnp->unit_layer_lp[layer]);
		strcat(dbg, cell);
		layer++;
	}
	DBG(dbg);

	// bottom rule
	strcpy(dbg, "---------+--");
	for ( fill = (num_layers+1)*colw; fill != 0; fill-- )
		strcat(dbg, "-");
	DBG(dbg);
}

/*
 *   Print the table of per-layer matrix sizes: a header mark, one row
 *   per matrix kind and a closing mark.
 */
void nn_mx_show(struct nn_s *nnp)
{
	// row order: zmxp, amxp, wmxp, dwmxp, bmxp, val zmxp, val amxp
	static const unsigned int row_types[] = { 2, 1, 3, 4, 5, 10, 11 };
	unsigned int r;

	nn_draw_hmark(nnp, "---layer---");
	for ( r = 0; r < sizeof(row_types)/sizeof(row_types[0]); r++ )
		nn_draw_dline(nnp, row_types[r]);
	nn_draw_hmark(nnp, "---------");
}

void nn_draw_dline(struct nn_s *nnp, unsigned type)
{
	unsigned int i, n, y;
	char buf[BUF_LEN];
	struct nn_layer_s **lpp, *lp;
	struct matrix_dsc_s *mxp;
	char *mxid[] = { " ", "amxp",   "zmxp",  "wmxp",    "dwxmp",
	            "bmxp",   "dbmxp",  "dmxp",  "d1_mxp",  "db1_mxp",
				"val_zmxp", "val_amxp" };
	
	y = 1;
	n = nn_num_lps(nnp);
	lpp = nn_get_lpp(nnp);
	
	sprintf(dbg, "%*s |", nnp->prwidth, mxid[type]); 

	for ( i = 0; i < n; i++ ) {
		lp = lpp[i];
		switch( type ) {
		case  1: mxp = lp->amxp;     break;  // activation
		case  2: mxp = lp->zmxp;     break;  // activation 
		case  3: mxp = lp->wmxp;     break;  // weight
		case  4: mxp = lp->dwmxp;    break;  // weight gradient
		case  5: mxp = lp->bmxp;     break;  // bias
		case  6: mxp = lp->dbmxp;    break;  // bias gradient
		case  7: mxp = lp->dmxp;     break;  // delta
		case  8: mxp = lp->d1_mxp;   break;  // previous delta
		case  9: mxp = lp->db1_mxp;  break;  // previous bias
		case 10: mxp = lp->val_zmxp; break;  // validation zmxp
		case 11: mxp = lp->val_amxp; break;  // validation amxp
		default: DBG("Error: no such type of layer matrix...");
				y = 0; break;
		}

		if (!y )
			break;
	
		if ( mxp ) {
			sprintf(buf, "  %7d x %-7d |",  MXROWS(mxp), MXCOLS(mxp));
			strcat(dbg, buf);
		} else {
			DBG("(NULL)");
		} 
	}
	
	if ( y ) {
		DBG(dbg);
	} 

}

/*
 *   Print a horizontal mark line of the matrix size table: the caption
 *   hp right-aligned in nnp->prwidth characters and a '+', then for
 *   every layer structure 2 * prwidth dashes closed by a '+'.
 */
void nn_draw_hmark(struct nn_s *nnp, char *hp)
{
	char head[BUF_LEN];
	unsigned int seg, dash, num_lps, colw;

	num_lps = nn_num_lps(nnp);
	colw = nnp->prwidth;

	sprintf(head, "%*s+", nnp->prwidth, hp);
	strcpy(dbg, head);

	seg = 0;
	while ( seg < num_lps ) {
		for ( dash = colw * 2; dash != 0; dash-- )
			strcat(dbg, "-");
		strcat(dbg, "+");
		seg++;
	}

	DBG(dbg);
}

struct nn_s *nn_init(struct na_params_s *p)
{
	unsigned int i, n, m, label_width, *ll;
	struct nn_s         *nnp;
	struct matrix_op_s  *mop;
	struct nn_layer_s   *lp;
	struct nlnopt_s     *nlp;
	

	m = 5;  // m = 100;  // batch size
	label_width = 10;    // for multi-class classification only

	nnp = CALLOC(1, struct nn_s);
	nnp->prwidth = label_width;
	nnp->num_classes = 1;  // no. of classes for classification
	
	
	//ll = nnp->unit_layer_lp;
	//nnp->num_labels = nnp->unit_layer_lp[nnp->num_layers-1];
	
	ll = nn_read_cfg(nnp);
	nnp->num_labels = ll[nnp->num_layers-1];

	n = nn_num_layers(nnp);
	
	nnp->num_lps = n - 1;
	
	nnp->dscp = strdup("2016 Physionet Challenge");
	nnp->mop  = mop = matrix_op_init2("ntuml");

	nnp->nlp  = nlp  = CALLOC(1, struct nlnopt_s);
	
	nnp->dim_lp = MALLOC(n, unsigned int);
	vector_dup_int(n, nnp->dim_lp, ll);

	// --------- Read in the training data sets -----------
	//   Different formats of input data sets are resolved in
	//   the routine which reads the corresponding data sets. 
	//   
	//   The read data finally will be pointed to by
	//    
	//     nnp->Xmxp       :  training data
	//     nnp->ymxp       :  training label data
	//     nnp->val_Xmxp   :  validation data
	//     nnp->val_ymxp   :  validataion label data
	//   
	//

	if ( 0 ) {
		// --- read the MNIST training and test data sets ---
		//nn_uot_data(nnp);
		nnp->batch_size = m;
	} else if ( 0 ) {
		// --- some other data sets I  ---

	} else if ( 0 ){
		// --- some other data sets II ---
		
	} else if ( 1 ) {
		// --- physionet 2016 challenge data sets ---
		nnp->lambda = 100000;
		nnp->alg_no = 6;
		nn_physionet_data(nnp);
		nnp->batch_size = MXROWS(nnp->Xmxp);
	} else {
		
	}

	// Create the target mxp for computing the nn cost function.
	if ( 0 ) {
		nnp->tmxp = matrix_create2(mop, "target labels", nnp->batch_size, 1);
	} else {
		nnp->tmxp = matrix_create2(mop, "target labels", 
			nnp->num_yrows, nnp->num_classes);
		if ( nnp->val_flag ) {
			nnp->rmxp = matrix_create2(mop, "validation results",
				MXROWS(nnp->val_ymxp), MXCOLS(nnp->val_ymxp));
		}

		nn_tlabels_raw2int(nnp, nnp->tmxp, nnp->ymxp);
		nn_tlabels_raw2int(nnp, nnp->rmxp, nnp->val_ymxp);
	}

	// ---- do the vector-matrix mapping and create struct nn_layer_s {} ----
	nn_vecmat_map(nnp);
	
	nn_randwt_bounds(nnp);

	return nnp;
}


/*
 *   Release a network created by nn_init(): the layer structures and
 *   their array, the description string, the unit lists, the matrix
 *   container, the optimization state and the nn_s itself.  nnp must
 *   not be used after the call.  A NULL nnp is ignored.
 *
 *   The layer array created by nn_vecmat_map() has nnp->num_lps
 *   entries (one fewer than there are layers), so that is the bound
 *   used to free the layer structures.
 */
void  nn_fini(struct nn_s *nnp)
{
	unsigned int  i, n;
	struct nn_layer_s **lpp;

	if ( !nnp )
		return;

	free(nnp->dscp);
	free(nnp->unit_layer_lp);  // storage allocated by nn_read_cfg()

	lpp = nnp->lpp;
	if ( lpp ) {
		n = nnp->num_lps;
		for ( i = 0; i < n; i++ ) {
			free(lpp[i]);  // the matrices they point to belong to nnp->mop
		} 
		free(lpp);
	}

	matrix_op_fini(nnp->mop);
	free(nnp->dim_lp);
	free(nnp->nlp);
	free(nnp);     // allocated by nn_init(); nn_pred_fini() does the same
}



/*
 *   Top-level operation on a network created by nn_init(): print the
 *   configuration of the layer matrices of types 1 .. 9, initialize
 *   the weights/biases and report the prediction precision.
 *
 *   The training run (nn_nlnopt2()) and the model dump
 *   (nn_dump_model()) are currently not called from here.
 */
void  nn_ops3(struct nn_s *nnp)
{
	unsigned int mx_type;

	for ( mx_type = 1; mx_type < 10; mx_type++ )
		nn_mx_conf(nnp, mx_type);

	// Choose how the weights/biases matrices get their values.
	if ( 0 ) {
		// from the RBM model (nn_rbm_training()), not available
	} else if ( 0 ) {
		// from pre-trained weights/biases files
		nn_load_weights_biases(nnp);
	} else {
		// randomly
		nn_init_weights_biases(nnp);
	}

	nn_ops_prec(nnp);
}


/*
 *   Write the model to files: for every layer structure i the weight
 *   matrix goes to file "w<i>" and the bias matrix to file "b<i>".
 */
void nn_dump_model(struct nn_s *nnp)
{
	char wt_name[BUF_LEN], bias_name[BUF_LEN];
	unsigned int k, count;
	struct nn_layer_s **layers;

	count  = nn_num_lps(nnp);
	layers = nn_get_lpp(nnp);

	k = 0;
	while ( k < count ) {
		sprintf(wt_name, "w%d", k);
		sprintf(bias_name, "b%d", k);
		matrix_write_file(wt_name, layers[k]->wmxp);
		matrix_write_file(bias_name, layers[k]->bmxp);
		k++;
	}
}


/*
 *   Compute the accuracy of the model obtained.  The prediction precision 
 *   for training data set and validation set is computed separately and
 *   reported through DBG().
 *
 *   The validation precision is computed only when a validation data
 *   set was loaded (nnp->val_flag set and nnp->rmxp created); without
 *   it there are no known good labels to compare against.
 *   nnp->pred_flag is raised only for the duration of the validation
 *   run, so that nn_prec() scores the validation activations.
 */
void nn_ops_prec(struct nn_s *nnp)
{
	double prec;

	prec = nn_prec(nnp, nnp->tmxp);
	sprintf(dbg, "training   prec = %g", prec); DBG(dbg); 

	if ( !nnp->val_flag || !nnp->rmxp )
		return;

	nnp->pred_flag = 1;
	prec = nn_prec(nnp, nnp->rmxp);
	sprintf(dbg, "validation prec = %g", prec); DBG(dbg); 
	nnp->pred_flag = 0;
}

/*
 *   Run the feed-forward pass and return the fraction of samples whose
 *   thresholded output equals the known good label.
 *
 *   std_mxp holds the known good labels.  The computed labels are
 *   taken from the last layer structure: lp->val_amxp when both
 *   nnp->val_flag and nnp->pred_flag are set, lp->amxp otherwise.
 *
 *   NOTE(review): only column 0 of both matrices is compared, once per
 *   column of the computed matrix, and the count is divided by the
 *   number of rows -- with more than one column the result can exceed
 *   1.  Confirm the intended multi-class behavior.
 *   NOTE(review): the threshold is fixed at 0.5 (the sigmoid value)
 *   regardless of NN_USE_SIGMOID -- confirm.
 */
double nn_prec(struct nn_s *nnp, struct matrix_dsc_s *std_mxp)
{
	const double thresh = 0.5;   // activation threshold: sigmoid, 0.5; tanh, 0.0;
	unsigned int row, col, num_rows, num_cols, hits;
	struct nn_layer_s *out_lp;
	struct matrix_dsc_s *pred_mxp;
	double **truth_rows, **pred_rows, *truth, *pred, label;

	out_lp = nn_last_lp(nnp);  
	nn_ff_layers(nnp);

	pred_mxp = ( nnp->val_flag && nnp->pred_flag ) ? out_lp->val_amxp
	                                               : out_lp->amxp;

	num_rows   = MXROWS(pred_mxp);
	num_cols   = MXCOLS(pred_mxp);
	truth_rows = MXRPP(std_mxp);    // known good labels
	pred_rows  = MXRPP(pred_mxp);   // computed/predicted labels

	hits = 0;
	for ( row = 0; row < num_rows; row++ ) {
		truth = truth_rows[row];
		pred  = pred_rows[row];
		for ( col = 0; col < num_cols; col++ ) {
			label = ( pred[0] >= thresh ) ? 1.0 : 0.0;
			if ( label == truth[0] )
				hits++;
		}
	}

	return (double)hits/(double)num_rows;
}


/*
 *   Create the training and validation matrices from the extracted
 *   feature data sets "data<alg_no>_train" and "data<alg_no>_val" found
 *   in the matrix container.
 *
 *   In both data sets the last column is the label: the other columns
 *   become the X matrix, the last one the y matrix.  The four slices
 *   are added to the container, their sizes are cached in nnp and the
 *   validation flag is set.
 */
void  nn_physionet_data(struct nn_s *nnp)
{
	char train_id[BUF_LEN], val_id[BUF_LEN];
	unsigned int rows, cols, alg_no = nnp->alg_no;
	struct matrix_op_s *mop = MOP(nnp);
	struct matrix_dsc_s *train_all, *val_all, *trX, *trY, *vaX, *vaY;

	sprintf(train_id, "data%d_train", alg_no);
	sprintf(val_id,   "data%d_val",   alg_no); 

	train_all = mop2mxp_by_str(mop, train_id);
	val_all   = mop2mxp_by_str(mop, val_id);

	// --- training set: features | label ---
	rows = MXROWS(train_all);
	cols = MXCOLS(train_all);
	trX = ntuml_mxpslice(train_all, 0, rows, 0, cols-1);
	trY = ntuml_mxpslice(train_all, 0, rows, cols-1, cols);
	nnp->Xmxp = trX;
	nnp->ymxp = trY;
	matrix_add_mxp(mop, trX);
	matrix_update_idstring(trX, "training data");
	matrix_add_mxp(mop, trY);
	matrix_update_idstring(trY, "training label");

	// --- validation set: features | label ---
	rows = MXROWS(val_all);
	cols = MXCOLS(val_all);
	vaX = ntuml_mxpslice(val_all, 0, rows, 0, cols-1);
	vaY = ntuml_mxpslice(val_all, 0, rows, cols-1, cols);
	nnp->val_Xmxp = vaX;
	nnp->val_ymxp = vaY;
	matrix_add_mxp(mop, vaX);
	matrix_update_idstring(vaX, "validation data");
	matrix_add_mxp(mop, vaY);
	matrix_update_idstring(vaY, "validation label");

	MXSIZE(train_all);
	MXSIZE(trX);
	MXSIZE(trY);
	MXSIZE(val_all);
	MXSIZE(vaX);
	MXSIZE(vaY);

	// --- cache the dimensions ---
	nnp->num_Xrows = MXROWS(trX);
	nnp->num_Xcols = MXCOLS(trX);
	nnp->num_yrows = MXROWS(trY);
	nnp->num_ycols = MXCOLS(trY);

	nnp->num_val_Xrows = MXROWS(vaX);
	nnp->num_val_Xcols = MXCOLS(vaX);
	nnp->num_val_yrows = MXROWS(vaY);
	nnp->num_val_ycols = MXCOLS(vaY);

	nnp->val_flag = 1;   // the following code will process the validation data set too
	nnp->pred_flag = 0;
}



/*
 *   Conversion between the formats of label data and NN internal data
 *   For classification, the data is arranged as 
 *          
 *           internal format                     label input
 *      --------------------------               -----------         
 *       0  ...  0   1   0  ...  0                    k
 *      --------------------------               -----------
 *                  k-th element in
 *                  a column vector
 */

/*-- Convert the training label data into NN internal format for classification --*/
void  nn_tlabels_raw2int(struct nn_s *nnp, struct matrix_dsc_s *std_mxp, struct matrix_dsc_s *raw_label_mxp)
{
	unsigned int i, j, m, n, k, mc;
	struct matrix_dsc_s *tmxp, *ymxp;
	double **trpp, *trp, **yrpp, *yrp;

	tmxp = std_mxp;         m = MXROWS(tmxp);    trpp = MXRPP(tmxp);
	ymxp = raw_label_mxp;   n = MXCOLS(tmxp);    yrpp = MXRPP(ymxp);
	
	matrix_clear(tmxp);   // Clear the target matrix.
	
	mc = nn_is_multiclass(nnp);

	for ( i = 0; i < m; i++ ) {
		trp = trpp[i];
		yrp = yrpp[i];
		if ( mc ) { 
			//
			// multi-classification, the tmxp has (nnp->columns) columns,
			// and the max numerical class no. is (nnp->num_classes-1)
			//
			k = (unsigned int)(yrp[0]);
			trp[k] = 1.0; // if class k = 0,      1,    2, ..., n
			              //     trp[0] = 1, trp[1]=1, trp[2] = 2, ...
						  //     i.e.  trp[k] = 1.0;
		} else { 
			// Just the binary classification
			k = (unsigned int)(yrp[0]);
			if ( k == 1 )
				trp[0] = 1.0;
		}
	}
}


/**------- Return 1 when the network classifies into more than one
 *           class (nnp->num_classes > 1), 0 otherwise.        -------**/
unsigned int nn_is_multiclass(struct nn_s *nnp) 
{
	if ( nnp->num_classes > 1 )
		return 1;

	return 0;
}


/** --- Load pre-trained weights and biases ---
 *
 *   For every layer structure i the weight matrix is loaded from the
 *   file "w<i>" and the bias matrix from the file "b<i>" with
 *   matrix_load_file().  The id and the size of each loaded matrix are
 *   reported through DBG().
 *
 *   With the local switch v set to 1 the file names are taken from a
 *   built-in test table instead (debugging only).
 **/
void  nn_load_weights_biases(struct nn_s *nnp)
{
	unsigned int i, n, v;
	struct matrix_dsc_s *wmxp, *bmxp;
	char *wtfilep,  *bsfilep, wbuf[BUF_LEN], bbuf[BUF_LEN],
		*wtbs[] = {          // Just for test, will be dropped.
		"xxweights0", "xxbias0", 
		"xxweights1", "xxbias1", 
		"xxweights2", "xxbias2", 
		"xxweights3", "xxbias3", 
		"xxweights4", "xxbias4" };
	struct nn_layer_s **lpp, *lp;

	v = 0;    // 1: debugging;  0: not debugging

	if ( v ) {
		n = ARRAYSIZE(wtbs)/2;  // This is just for test. 
	} else {
		n = nn_num_layers(nnp)-1;
	}

	sprintf(dbg, "loading pre-trained %d pairs of weights and bias", n); DBG(dbg);

	lpp = nn_get_lpp(nnp);
	for ( i = 0; i < n; i++ ) {
		lp = lpp[i];
		if ( v ) { 
			wtfilep = wtbs[2*i];
			bsfilep = wtbs[2*i + 1];
		} else {
			// 
			// Load the pre-trained model parameter files
			// weight files are "w0", "w1", "w2" ...
			// bias   files are "b0", "b1", "b2" ...
			//
			sprintf(wbuf, "w%d", i);  
			sprintf(bbuf, "b%d", i);
			wtfilep = wbuf;
			bsfilep = bbuf;
		}
		wmxp = lp->wmxp;
		bmxp = lp->bmxp;
		matrix_load_file(wmxp, wtfilep);
		matrix_load_file(bmxp, bsfilep);
		// Report the weight matrix with its own column count.
		sprintf(dbg, " loaded %44s: (%5d x %5d)", 
			MXID(wmxp), MXROWS(wmxp),  MXCOLS(wmxp) ); DBG(dbg); 
		sprintf(dbg, " loaded %44s: (%5d x %5d)", 
			MXID(bmxp), MXROWS(bmxp),  MXCOLS(bmxp) ); DBG(dbg); 
	}

	DBG("Finished loading the model ...");
}

/** --- Fill the weight and bias matrices of every layer structure with
 *       random values between nnp->initwt_lb and nnp->initwt_ub; the id
 *       and size of each matrix are reported through DBG(). --- **/
void  nn_init_weights_biases(struct nn_s *nnp)
{
	const double lo = nnp->initwt_lb;   // lower bound of the initial values
	const double hi = nnp->initwt_ub;   // upper bound of the initial values
	unsigned int k, count;
	struct nn_layer_s **layers, *layer;
	struct matrix_dsc_s *wt, *bias;

	count = nn_num_lps(nnp);  // no. of lps in the NN
	sprintf(dbg, "randomly init weights and bias pairs: %d", count);
	DBG(dbg);

	layers = nn_get_lpp(nnp);
	k = 0;
	while ( k < count ) {
		layer = layers[k];
		wt    = layer->wmxp;
		bias  = layer->bmxp;

		sprintf(dbg, "layer id %d: %s (%d, %d)",
			layer->id, MXID(wt), MXROWS(wt), MXCOLS(wt) );
		DBG(dbg); 
		sprintf(dbg, "layer id %d: %s (%d, %d)",
			layer->id, MXID(bias), MXROWS(bias), MXCOLS(bias) );
		DBG(dbg); 

		matrix_init_random(wt, lo, hi);
		matrix_init_random(bias, lo, hi);
		k++;
	}

	DBG("Finished init weights and bias pairs ...");
}

/*
 *   vector-matrix mapping
 *   associate various corresponding matrices in contiguous nn_layer_s {}
 *
 *   Steps, in this order (the order in which matrices are created in
 *   the matrix container matters, they are looked up by index below):
 *     1. create the 8-row "vector storage" matrix and hand its rows to
 *        the nonlinear optimization state nnp->nlp;
 *     2. create the weight/bias matrices and their gradient matrices
 *        with matrix_arrays() (presumably laid over the xkp and gkp
 *        rows -- TODO confirm against matrix_arrays());
 *     3. allocate one nn_layer_s per layer structure (num_lps of them)
 *        and attach the weight, bias and gradient matrices;
 *     4. create the validation activation matrices and chain them;
 *     5. create the activation matrices and chain A(l) to Z(l+1);
 *     6. set the input Z of the first layer structure;
 *     7. create the delta matrices and link each layer structure to
 *        the delta and bias gradient of the following one.
 *
 *   Requires nnp->mop, nnp->nlp, nnp->dim_lp, the layer counts and the
 *   batch size to be set, and the training data to be loaded.
 */
void nn_vecmat_map(struct nn_s *nnp)
{
	unsigned int i, j, m, n, nl, nr, idx, idx2, *ap, 
				batch_size, num_dim, num_lps, num_weights;
	char id[BUF_LEN], id2[BUF_LEN], id3[BUF_LEN], buf[BUF_LEN];
	struct nn_layer_s **lpp, *lp, *lp2;
	struct nlnopt_s *nlp;
	struct matrix_op_s  *mop;
	struct matrix_dsc_s *vmxp, *wmxp, *bmxp, *dwmxp, *dbmxp, *amxp, *zmxp, *dmxp;
	double **vrpp, *vrp, *xkp, *gkp;

	mop = MOP(nnp); 

	nl = nn_num_layers(nnp);        // no. of layers in the NN
	num_dim = nn_vector_dim(nnp);   // dim of the vector
	num_lps = nn_num_lps(nnp);
	batch_size = nn_batch_size(nnp);

	// --- 1. Create the vector storage for nonlinear optimization ---
	//          point and its gradient at the point
	//      
	nlp = nnp->nlp;
	
	nr = 8;   // # of rows, the number of chunks of continuous memory storage
	vmxp = matrix_create2(mop, "vector storage", nr, num_dim);
	i = 0,  vrpp = MXRPP(vmxp);
	
	// ----- vector storage used by nonlinear optimization -----
	//       (one row of vmxp per vector)
	nlp->xkp  = xkp = vrpp[i++];     nlp->gkp = gkp = vrpp[i++];
	nlp->xk1p = vrpp[i++];           nlp->gk1p      = vrpp[i++];
	nlp->dkp  = vrpp[i++];           nlp->ykp       = vrpp[i++];
	nlp->xk0p = vrpp[i++];           nlp->dk0p      = vrpp[i++];
	
	// --------- parameters for nonlinear optimization ---------
	nlp->tao  = 2;       nlp->rho  = 0.01;     nlp->sigma = 0.5; 
	nlp->tao1 = 9.0;     nlp->tao2 = 0.1;      nlp->tao3  = 0.9;
	nlp->EXT  = 3.0;     nlp->INT  = 0.1;      nlp->h     = 1;
	nlp->h0   = 0.1;     nlp->x0   = -2;       nlp->f0    = 4;
	nlp->num_dims = num_dim;
	// 
	// --- 2. Map the vectors to the matrices of weights and gradients ---
	//

	ap = nn_wmxp_params(nl, nnp->dim_lp, &m);
	
	// --- Create weight and bias matrixes ---
	idx  = matrix_arrays(mop, xkp, ap, m/2, 0);  // 0 means non-gradients matrixes
	// --- Create weight and bias gradient matrixes ---
	idx2 = matrix_arrays(mop, gkp, ap, m/2, 1); // 1 means gradients matrixes

	nnp->lpp = lpp = MALLOC(num_lps,  struct nn_layer_s *);

	sprintf(dbg, " # layers (struct nn_layer_s{} ) = %d and num_lps = %d  # num_dim = %d",
		nl, num_lps, num_dim); DBG(dbg); 
	
	/*
	 *  Create the structure for each layer and initialize the weight 
	 *  and the weight delta matrix for each layer structure.  Create 
	 *  a delta matrix in each layer for backprop process too.
	 *
	 *  The matrices are fetched from the container by index: starting
	 *  at idx (resp. idx2 for the gradients) they alternate weight,
	 *  bias, weight, bias, ...
	 */ 

	for ( i = 0; i < num_lps; i++ ) {
		lpp[i] = lp = CALLOC(1, struct nn_layer_s);
		lp->id = i;
		
		lp->wmxp  = wmxp  = MOP2MXP(mop, idx  + 2*i    );
		lp->bmxp  = bmxp  = MOP2MXP(mop, idx  + 2*i + 1);

		lp->dwmxp = dwmxp = MOP2MXP(mop, idx2 + 2*i    );
		lp->dbmxp = dbmxp = MOP2MXP(mop, idx2 + 2*i + 1);
		
		if ( nnp->val_flag ) {  // This val_flag is set if validation process is required.
			lp->val_amxp = matrix_create2(mop, "validation data",  
				nnp->num_val_Xrows, MXCOLS(lp->wmxp) ); 
		}

		if ( 0 ) {
			sprintf(dbg, "layer %d--%d weights (%4d x %4d)",
			  i, i+1, MXROWS(wmxp), MXCOLS(wmxp) );  DBG(dbg); 
			sprintf(dbg, "layer %d--%d bias    (%4d x %4d)",
			  i, i+1, MXROWS(bmxp), MXCOLS(bmxp) );  DBG(dbg); 
			sprintf(dbg, "layer %d--%d wt grad.(%4d x %4d)",
		  	  i, i+1, MXROWS(dwmxp), MXCOLS(dwmxp) ); DBG(dbg); 
			sprintf(dbg, "layer %d--%d bs grad.(%4d x %4d)",
		  	  i, i+1, MXROWS(dbmxp), MXCOLS(dbmxp) ); DBG(dbg); 
		}
	}
	
	// Chain the validation matrices: the validation input of a layer
	// structure is the validation activation of the previous one.
	if ( nnp->val_flag ) {
		lp = lpp[0];
		lp->val_zmxp = nnp->val_Xmxp;   // This is the input of the validation data
		for ( i = 1; i < num_lps; i++ ) {
			lp2 = lpp[i];
			lp  = lpp[i-1];
			lp2->val_zmxp = lp->val_amxp;
		}
	}


	/*
	 *  Create an activation matrix (A) for each layer.
	 *  Now associate each A in layer(l) with Z in layer (l+1)
	 *  except the first layer in the NN network.  For example,
	 *
	 *           |           |           |          |          |
	 *  (0)  (1) | (1)   (2) |  (2)  (3) |  (3) (4) | (4)  (5) | 
	 *  z     a  |  z     a  |   z    a  |  z    a  |  z    a  |
	 *  ^        |           |           |          |       ^  |
	 *  |        |           |           |          |       |  |
	 *  |    (1) | (1)   (2) |  (2)  (3) |  (3) (4) | (4)  (5) | 
	 *  |     d  |  d1    d  |   d1   d  |  d1   d  | d1    d  |
	 *  |     +-----+     +------+    +-----+    +----+     |  |
	 * input                                              output
	 *
	 */
	
	// This is the input data matrix,
    // i.e. either the SGD or mini-batch GD matrix
	
	for ( i = 0; i < num_lps; i++ ) {
		lp = lpp[i];
		wmxp = lp->wmxp;
		n = MXCOLS(wmxp);
		sprintf(id, "A(%d)<-->Z(%d) layers %d<-->%d", i, i+1, i, i+1);
		lp->amxp = amxp = matrix_create2(mop, id, batch_size, n);
		
		sprintf(dbg, "A/Z layers %d--%d ... matrix (%4d x %-4d)", 
			i, i+1, MXROWS(amxp), MXCOLS(amxp) );  DBG(dbg);
		
	/* 
	 *  Now associate each A in layer(l) with Z in layer (l+1)
	 *  except the first layer in the NN network.
	 */
		if ( i != (num_lps -1) ) {
			lp2 = lpp[i+1];
			lp2->zmxp = lp->amxp;
		}
	}
	
	lp = lpp[0];  // Set up the input matrix (raw data) here. 
	
	if ( 0 ) {
		lp->zmxp = mop2mxp_by_str(mop, "train-images-idx3-ubyte");
	} else if ( 1 ) {
		// for physionet 2016 challenge extracted feature data
		lp->zmxp = nnp->Xmxp;
		
	} else {
		lp->zmxp = matrix_create_rowp_only("NN input data Z layer 0", batch_size, 
			MXROWS(lp->wmxp) );
		matrix_add_mxp(mop, lp->zmxp);
	}
	
	if ( 1 )   // 1 if print verification
	  for ( i = 0; i < num_lps; i++ ) {
		lp  = lpp[i];
		sprintf(dbg, " Verifying Z in layer %d", i); DBG(dbg); 
		MXSIZE(lp->wmxp);
		MXSIZE(lp->zmxp);
		MXSIZE(lp->amxp);
		DBG("====================================");
	  }

	/*
	 *   Delta matrices:
	 *   1.  The delta matrix has the same dimension as the 
	 *        activation matrix in the same layer.
	 *   2.  The last delta matrix, i.e. the one at the output
	 *        layer, is computed separately.
	 */

	for ( i = 0; i < num_lps; i++ ) {
		lp  = lpp[i];
		dmxp = lp->amxp;

		sprintf(id3, "delta for layer %d +1", i);  
		lp->dmxp = matrix_dup2(id3, dmxp);
		matrix_add_mxp(mop, lp->dmxp);
	}
	
	/*
	 * Associate the delta matrix in layer (l+1) to that in layer l.
	 *           |           |           |          |          |
	 *  (0)  (1) | (1)   (2) |  (2)  (3) |  (3) (4) | (4)  (5) | 
	 *  |        |           |           |          |          |
	 *  |d    d1 |  d     d1 |   d    d1 |  d    d1 | d <--+ d1|
	 *  |     +-----+     +------+    +-----+    +----+    | N |
	 *  |                                                  | U
	 *  |                                                  | L
	 *  |                                                  | L
	 *                                           computed specially
	 *
	 * The last layer structure keeps d1_mxp and db1_mxp NULL.
	 */
	
	for ( i = 1; i < num_lps; i++ ) {
		lp  = lpp[i-1];
		lp2 = lpp[i];
		lp->d1_mxp  = lp2->dmxp;
		lp->db1_mxp = lp2->dbmxp;
	}

	free(ap);
}

/**----- Set the bounds for initial weights/biases if NN chooses 
 *         to initialize weights/biases randomly.
 *
 *       The bound is sqrt(6 / (units in the first layer + units in the
 *       last layer)); the upper bound is +bound, the lower one -bound.
 *                                                              ---- **/
void   nn_randwt_bounds(struct nn_s *nnp)
{
	unsigned int last, fan_in, fan_out;
	double bound;

	last    = nn_num_layers(nnp) - 1;
	fan_in  = nnp->unit_layer_lp[0];
	fan_out = nnp->unit_layer_lp[last];

	bound = sqrt(6.0/(double)(fan_in + fan_out));

	nnp->initwt_ub =  bound;
	nnp->initwt_lb = -bound;
}


void   nn_mx_conf(struct nn_s *nnp, unsigned int type)
{
	unsigned int i, n, y;
	struct nn_layer_s *lp, **lpp;
	struct matrix_dsc_s *mxp;
	char *mxid[] = { "amxp", "zmxp", "wt. wmxp", "wt. gradient dwxmp", "bias bmxp", 
	"bias gradient dbmxp", "delta dmxp", "prev delta d1_mxp",  "db1_mxp"};
	n = nn_num_lps(nnp);
	lpp = nnp->lpp;

	y = 1;
	
	sprintf(dbg, " ----- type (%d) %12s-----", 
		type, mxid[type-1] ); DBG(dbg);
	
	for ( i = 0; i < n; i++ ) {
		lp = lpp[i];
		switch( type ) {
		case 1:	mxp = lp->amxp;    break;  // activation
		case 2:	mxp = lp->zmxp;    break;  // activation 
		case 3:	mxp = lp->wmxp;    break;  // weight
		case 4:	mxp = lp->dwmxp;   break;  // weight gradient
		case 5:	mxp = lp->bmxp;    break;  // bias
		case 6:	mxp = lp->dbmxp;   break;  // bias  gradient
		case 7:	mxp = lp->dmxp;    break;  // delta
		case 8:	mxp = lp->d1_mxp;  break;  // previous delta
		case 9:	mxp = lp->db1_mxp; break;  // previous bias
		default: DBG("Error: no such type of layer matrix...");
				y = 0; break;
		}

		if (!y )
			break;
	
		if ( mxp ) {
			sprintf(dbg, " layer %2d: %24s  (%4d x %-4d)", i, 
				MXID(mxp), MXROWS(mxp), MXCOLS(mxp) ); DBG(dbg); 
		} else {
			DBG("(NULL)");
		} 
	}
	DBG("");

}

/*
 *  Return nnp->num_lps, the number of layer structures in lpp[].
 *  A zero count is reported through DBG but is still returned.
 */
unsigned int nn_num_lps(struct nn_s *nnp)
{
	unsigned int count = nnp->num_lps;

	if ( count != 0 )
		return count;

	sprintf(dbg, "Warning: NN number of lps (lpp[]) = %d not set", count);
	DBG(dbg); 
	return count;
}

/*
 *  Return nnp->num_layers.  A zero count is reported through DBG
 *  but is still returned.
 */
unsigned int nn_num_layers(struct nn_s *nnp)
{
	unsigned int count = nnp->num_layers;

	if ( count != 0 )
		return count;

	sprintf(dbg, "Warning: NN number of layers = %d not set", count);
	DBG(dbg); 
	return count;
}

/**---- Return the evaluation counter nnp->num_calls (the counter is
 *      incremented by nn_funcval() on every call)               ----**/
unsigned int nn_num_calls(struct nn_s *nnp)
{
	unsigned int calls = nnp->num_calls;

	return calls;
}

/*
 *  Return nnp->num_samples.  A zero count is reported through DBG
 *  but is still returned.
 */
unsigned int nn_num_samples(struct nn_s *nnp)
{
	unsigned int count = nnp->num_samples;

	if ( count != 0 )
		return count;

	sprintf(dbg, "Warning: NN number of samples = %d", count);
	DBG(dbg); 
	return count;
}

/*
 *  Return nnp->batch_size.  A zero size is reported through DBG
 *  but is still returned.
 */
unsigned int nn_batch_size(struct nn_s *nnp)
{
	unsigned int size = nnp->batch_size;

	if ( size != 0 )
		return size;

	sprintf(dbg, "Warning: NN batch size = %d", size);
	DBG(dbg); 
	return size;
}

/*
 *  Return nnp->cur_batch_off, the row offset of the current batch.
 *  A zero offset is reported through DBG but is still returned.
 *
 *  NOTE(review): an offset of 0 looks like a legitimate value (it is
 *  presumably what the first batch uses), so this warning may be
 *  spurious -- confirm the intent before relying on it.
 */
unsigned int nn_cur_batch_off(struct nn_s *nnp)
{
	unsigned int offset = nnp->cur_batch_off;

	if ( offset != 0 )
		return offset;

	sprintf(dbg, "Warning: NN batch off = %d", offset );
	DBG(dbg); 
	return offset;
}

/* Return the array of layer-structure pointers, nnp->lpp. */
struct nn_layer_s **nn_get_lpp(struct nn_s *nnp)
{
	struct nn_layer_s **layers = nnp->lpp;

	return layers;
}

// Return nnp->vec_dim, the length of the flat vector that holds all
//  the weight and bias entries.  The value is computed on demand:
//  while it is still zero, nn_vecmat_dim() is called first to fill
//  it in.
unsigned int nn_vector_dim(struct nn_s *nnp)
{
	if ( nnp->vec_dim == 0 ) {
		nn_vecmat_dim(nnp);
	}

	return nnp->vec_dim;
}

// --- Compute the dimension of the vector used in the nonlinear optimization ---
//
//  For every pair of adjacent layers (i, i+1) with m = dim_lp[i] and
//  n = dim_lp[i+1] units, the vector needs m*n entries for the weight
//  matrix plus n entries for the bias row.  The total is stored in
//  nnp->vec_dim.
//
//  With fewer than two layers there is no layer pair, and vec_dim is
//  set to 0.  The loop bound is written as (i + 1 < nl) because the
//  unsigned expression (nl - 1) wraps around when nl == 0.
void  nn_vecmat_dim(struct nn_s *nnp)
{ 
	unsigned int i, nl, m, n, s, *ap;
	
	s = 0;
	nl = nnp->num_layers;
	ap = nnp->dim_lp;

	for ( i = 0; i + 1 < nl; i++ ) {
		
		m = ap[i];
		n = ap[i+1];

		s += m * n + n;   // weight matrix entries + bias entries
	}

	nnp->vec_dim = s;
}

/*
 *  Load the current batch into the input layer and build the target
 *  matrix for it.
 *
 *  For each row r of the batch (r = 0 .. batch_size-1):
 *    - row r of the input layer's zmxp is pointed at row
 *      (r + cur_batch_off) of Xmxp (a pointer assignment, no data copy);
 *    - the first element of row (r + cur_batch_off) of ymxp is
 *      truncated to an unsigned integer label t, and row r of tmxp is
 *      set to 1.0 in column t and 0.0 in every other column.
 *
 *  Finally nnp->cur_batch_off is advanced by batch_size.
 *
 *  NOTE(review): no check is made that (cur_batch_off + batch_size)
 *  stays within the rows of Xmxp / ymxp -- presumably the caller
 *  guarantees this; confirm.
 */
void   nn_load_batchdata(struct nn_s *nnp)
{
	unsigned int row, col, num_cols, bsize, boff, label;
	struct nn_layer_s *in_lp;
	double **zrows, **Xrows, **trows, **yrows, *target;

	boff  = nn_cur_batch_off(nnp);
	bsize = nn_batch_size(nnp);

	in_lp = nnp->lpp[0];               // the input layer

	Xrows = MXRPP(nnp->Xmxp);
	zrows = MXRPP(in_lp->zmxp);
	yrows = MXRPP(nnp->ymxp);
	trows = MXRPP(nnp->tmxp);
	num_cols = MXCOLS(nnp->tmxp);

	for ( row = 0; row < bsize; row++ ) {
		zrows[row] = Xrows[row + boff];
		label  = (unsigned int)(yrows[row + boff][0]);
		target = trows[row];

		// one-hot encoding of the label
		for ( col = 0; col < num_cols; col++ )
			target[col] = ( col == label ) ? 1.0 : 0.0;
	}

	nnp->cur_batch_off += bsize;
}


/** ===================== NN  feedforward process (FF) ================= **/
/*
 *  Run nn_ff_layer() on every layer structure, first to last.
 *  When both nnp->val_flag and nnp->pred_flag are set, the layer's
 *  use_val flag is raised for the duration of the call; it is always
 *  cleared again afterwards.
 */
void   nn_ff_layers(struct nn_s *nnp)
{
	unsigned int idx, count;
	struct nn_layer_s **layers, *cur;

	count  = nn_num_lps(nnp);
	layers = nn_get_lpp(nnp);

	idx = 0;
	while ( idx < count ) {
		cur = layers[idx++];

		if ( nnp->val_flag && nnp->pred_flag ) {
			cur->use_val = 1;
		}

		nn_ff_layer(cur);
		cur->use_val = 0;
	}
}

/*-- Feed-forward pass for validation: every layer is run through
 *   nn_ff_layer() with its use_val flag set, so that the layer uses
 *   its val_zmxp / val_amxp matrices.  The flag is cleared after
 *   each layer. --*/

void   nn_val_layers(struct nn_s *nnp)
{
	unsigned int idx, count;
	struct nn_layer_s **layers, *cur;

	count  = nn_num_lps(nnp);
	layers = nn_get_lpp(nnp);

	idx = 0;
	while ( idx < count ) {
		cur = layers[idx++];

		cur->use_val = 1;
		nn_ff_layer(cur);
		cur->use_val = 0;
	}
}



/** ================ NN  backpropagation process (BP) ================ **/
//  
//  Drive the BP process over all layer structures:
//	1) the output layer is handled first by nn_bp_output();
//	2) the remaining layers are then handled back to front, i.e.
//	   nn_bp_layer_lp(lpp[i], lpp[i+1]) for i = num_lps-2, ..., 1, 0.
//
//  Each step computes the delta matrix, the weight gradient matrix
//  and the bias gradient matrix of its layer.
//	   Note: the bias gradient matrix should be a one-column matrix/vector
//	   mathematically. Here, we use the one-row matrix implementation
//	   just for the convenience of the C programming.
//
//  Nothing is done when there are no layer structures.
//
void  nn_bp_layers(struct nn_s *nnp)
{
	unsigned int next, num_lps;
	struct nn_layer_s **lpp;

	lpp = nnp->lpp;
	num_lps = nn_num_lps(nnp);

	if ( num_lps == 0 )
		return;

	nn_bp_output(nnp);

	// 'next' is the index of the layer whose delta is already known
	for ( next = num_lps - 1; next > 0; next-- ) {
		nn_bp_layer_lp(lpp[next - 1], lpp[next]);
	}
}



/** === Compute the delta matrix and the gradients of weights 
 *    (matrix) and bias (one-row matrix) at the layers except
 *    the output one === 
 *
 *    lp  is the layer being updated (l), lp2 the layer after it (l+1).
 *
 *    delta:   d(l) = ( d(l+1) * w(l+1).T() ) .* h'( z(l+1) )
 *             where z(l+1) already holds activated values, so the
 *             derivative is a*(1-a) for sigmoid and 1-a*a for tanh.
 *    bias:    db(l) = column sums of d(l), divided by the row count.
 *    weight:  delegated to nn_bp_weight_gradient(z(l), d(l), dw(l)).
 *
 *    Returns early (after a DBG message) when mxmul_check_abTc()
 *    rejects the matrix dimensions.
 **/

void  nn_bp_layer_lp(struct nn_layer_s *lp, struct nn_layer_s *lp2)
{
	unsigned int row, col, k, num_rows, num_cols, inner, dims_ok;
	struct matrix_dsc_s *dmxp, *dbmxp, *dmxp2, *wmxp2;
	double back, act, *dbias, *delta, *delta_next, *wrow, *act_row,
	       **dbrpp, **drpp, **drpp2, **wrpp2, **zrpp2;

	dmxp  = lp->dmxp;
	dbmxp = lp->dbmxp;
	dmxp2 = lp2->dmxp;
	wmxp2 = lp2->wmxp;

	dbrpp = MXRPP(dbmxp);
	zrpp2 = MXRPP(lp2->zmxp);

	dims_ok = mxmul_check_abTc(dmxp2, wmxp2, dmxp);
	if ( !dims_ok ) {
	 	DBG(" D(l+1) .T() * W (l+1) = D, matrix dims do not match");
		return;
	}

	// the bias gradient is accumulated below, so start from zero
	matrix_clear(dbmxp);
	dbias = dbrpp[0];

	num_rows = MXROWS(dmxp);
	num_cols = MXCOLS(dmxp);
	inner    = MXCOLS(dmxp2);

	drpp  = MXRPP(dmxp);
	drpp2 = MXRPP(dmxp2);
	wrpp2 = MXRPP(wmxp2);

	for ( row = 0; row < num_rows; row++ ) {
		delta      = drpp[row];
		delta_next = drpp2[row];
		act_row    = zrpp2[row];

		for ( col = 0; col < num_cols; col++ ) {
			// row 'col' of w(l+1) is column 'col' of its transpose
			wrow = wrpp2[col];
			back = 0;
			for ( k = 0; k < inner; k++ ) {
				back += delta_next[k] * wrow[k];
			}

			// element-wise product with the activation derivative
			act = act_row[col];
			if ( NN_USE_SIGMOID ) {
				delta[col] = back * act * (1.0 - act);
			} else {
				delta[col] = back * (1.0 - act *act );
			}

			dbias[col] += delta[col];
		}
	}

	// average the accumulated bias gradient over the rows
	for ( col = 0; col < num_cols; col++ ) {
		dbias[col] /= (double)num_rows;
	}

	//    ( z(l).T() * d(l) ) / batch_size = dw(l)
	nn_bp_weight_gradient(lp->zmxp, dmxp, lp->dwmxp);
}



/** === Compute the delta matrix and the gradients of weights 
 *    (matrix) and bias (one-row matrix) at the output layer === 
 *
 *    delta:   d = a - t   (computed output minus target matrix)
 *    bias:    db = column sums of d, divided by the row count.
 *    weight:  delegated to nn_bp_weight_gradient(z, d, dw).
 *
 *    Returns early (after DBG messages) when the target and the
 *    activation matrices differ in size.
 **/

void  nn_bp_output(struct nn_s *nnp)
{
	unsigned int row, col, num_rows, num_cols;
	struct nn_layer_s *out_lp;
	struct matrix_dsc_s *tmxp, *amxp, *dmxp, *dbmxp;
	double **trpp, **arpp, **drpp, **dbrpp,
	       *target, *act, *delta, *dbias;

	out_lp = nn_last_lp(nnp);    // the last (output) layer struct

	tmxp  = nnp->tmxp;           // target matrix
	amxp  = out_lp->amxp;        // computed labels
	dmxp  = out_lp->dmxp;
	dbmxp = out_lp->dbmxp;

	trpp  = MXRPP(tmxp);
	arpp  = MXRPP(amxp);
	drpp  = MXRPP(dmxp);
	dbrpp = MXRPP(dbmxp);

	if ( !MXSIZE_EQ(tmxp, amxp) ) {
		sprintf(dbg, "tmxp %s and amxp %s are not of the same dims",
			MXID(tmxp), MXID(amxp) ); DBG(dbg); 
		MXSIZE(tmxp);  MXSIZE(amxp);
		return ;
	}

	// the bias gradient is accumulated below, so start from zero
	matrix_clear(dbmxp);
	dbias = dbrpp[0];

	num_rows = MXROWS(amxp);
	num_cols = MXCOLS(amxp);

	for ( row = 0; row < num_rows; row++ ) {
		delta  = drpp[row];
		act    = arpp[row];
		target = trpp[row];
		for ( col = 0; col < num_cols; col++ ) {
			delta[col]  = act[col] - target[col];
			dbias[col] += delta[col];   // accumulate over all rows
		}
	}

	for ( col = 0; col < num_cols; col++ ) {
		dbias[col] /= (double)num_rows;
	}

	nn_bp_weight_gradient(out_lp->zmxp, dmxp, out_lp->dwmxp);
}



/** ================== NN  feedforward step (FF) ================= **/
/*
 *  Compute the output of one layer:
 *
 *       amxp = h( zmxp * wmxp + bmxp )
 *
 *  where h is sigmoid() when NN_USE_SIGMOID is set and tanh()
 *  otherwise, and bmxp is a one-row matrix added to every row.
 *  When lp->use_val is set, val_zmxp / val_amxp take the place of
 *  zmxp / amxp.
 *
 *  The function returns without computing anything when
 *  mxmul_check_abc() rejects the matrix dimensions.
 */
void nn_ff_layer(struct nn_layer_s *lp)
{
	unsigned int row, col, k, num_rows, num_cols, inner, dims_ok;
	struct matrix_dsc_s *in_mxp, *out_mxp, *wmxp, *bmxp;
	double **out_rows, **in_rows, **w_rows, **b_rows,
	       *out, *in, *bias, sum;

	if ( lp->use_val ) {
		in_mxp  = lp->val_zmxp;
		out_mxp = lp->val_amxp;
	} else {
		in_mxp  = lp->zmxp;
		out_mxp = lp->amxp;
	}
	wmxp = lp->wmxp;
	bmxp = lp->bmxp;

	out_rows = MXRPP(out_mxp);
	in_rows  = MXRPP(in_mxp);
	w_rows   = MXRPP(wmxp);
	b_rows   = MXRPP(bmxp);

	num_rows = MXROWS(out_mxp);
	num_cols = MXCOLS(out_mxp);
	inner    = MXROWS(wmxp);

	// sanity check of the dimensions for the matrix multiplication
	dims_ok = mxmul_check_abc(in_mxp, wmxp, out_mxp);
	if ( !dims_ok )
		return;

	bias = b_rows[0];
	for ( row = 0; row < num_rows; row++ ) {
		out = out_rows[row];
		in  = in_rows[row];

		for ( col = 0; col < num_cols; col++ ) {
			sum = 0.0;
			for ( k = 0; k < inner; k++ ) {
				sum += in[k] * w_rows[k][col];
			}
			sum += bias[col];

			// apply the activation function
			if ( NN_USE_SIGMOID ) {
				out[col] = sigmoid(sum);
			} else {
				out[col] = tanh(sum);
			}
		}
	}
}




/*
 *  Compute the weight gradient at layer (l):
 *
 *       dwmxp = ( zmxp.T() * dmxp ) / batch_size
 *
 *  where batch_size is the number of rows of zmxp.  Returns without
 *  touching dwmxp when mxmul_check_aTbc() rejects the dimensions.
 */
void  nn_bp_weight_gradient(struct matrix_dsc_s *zmxp, struct matrix_dsc_s *dmxp, struct matrix_dsc_s *dwmxp) 
{
	unsigned int row, col, k, num_rows, num_cols, batch_size, dims_ok;
	double **zrpp, **drpp, **dwrpp, *grad_row, sum;

	dims_ok = mxmul_check_aTbc(zmxp, dmxp, dwmxp);
	if ( !dims_ok )
		return;

	zrpp  = MXRPP(zmxp);
	drpp  = MXRPP(dmxp);
	dwrpp = MXRPP(dwmxp);

	num_rows   = MXROWS(dwmxp);
	num_cols   = MXCOLS(dwmxp);
	batch_size = MXROWS(zmxp);   // also the length of the inner sum

	for ( row = 0; row < num_rows; row++ ) {
		grad_row = dwrpp[row];
		for ( col = 0; col < num_cols; col++ ) {
			sum = 0.0;
			// column 'row' of zmxp times column 'col' of dmxp
			for ( k = 0; k < batch_size; k++ ) {
				sum += zrpp[k][row] * drpp[k][col];
			}
			grad_row[col] = sum / (double) batch_size;
		}
	}
}

/** =============  Compute the bias gradient dbmxp  ============ 
 *
 *  Row 0 of lp->dbmxp is set to the column sums of lp->dmxp.
 *
 *  NOTE(review): unlike nn_bp_layer_lp() and nn_bp_output(), the
 *  sums are NOT divided by the number of rows here -- confirm which
 *  behavior the callers of this routine expect.
 **/
void  nn_bp_dbmxp(struct nn_layer_s *lp)
{
	unsigned int row, col, num_rows, num_cols;
	double **drpp, *delta, *dbias;

	drpp     = MXRPP(lp->dmxp);     // delta matrix
	num_rows = MXROWS(lp->dmxp);
	num_cols = MXCOLS(lp->dmxp);
	dbias    = MXRPP(lp->dbmxp)[0]; // one-row bias gradient matrix

	for ( col = 0; col < num_cols; col++ )
		dbias[col] = 0.0;

	for ( row = 0; row < num_rows; row++ ) {
		delta = drpp[row];
		for ( col = 0; col < num_cols; col++ )
			dbias[col] += delta[col];
	}
}


/*
 *  Write the matrices of every layer through matrix_write_file().
 *  The name passed for each matrix is the matrix kind followed by
 *  the layer index, e.g. "zmxp0", "dwmxp2".
 */
void   nn_layers_write(struct nn_s *nnp)
{
	char name[64];
	unsigned int idx, count;
	struct nn_layer_s **layers, *cur;

	count  = nn_num_lps(nnp);
	layers = nnp->lpp;

	for ( idx = 0; idx < count; idx++ ) {
		cur = layers[idx];

		// layer input and output
		sprintf(name, "zmxp%d", idx);
		matrix_write_file(name, cur->zmxp);
		sprintf(name, "amxp%d", idx);
		matrix_write_file(name, cur->amxp);

		// weights and biases
		sprintf(name, "wmxp%d", idx);
		matrix_write_file(name, cur->wmxp);
		sprintf(name, "bmxp%d", idx);
		matrix_write_file(name, cur->bmxp);

		// gradients and delta
		sprintf(name, "dwmxp%d", idx);
		matrix_write_file(name, cur->dwmxp);
		sprintf(name, "dbmxp%d", idx);
		matrix_write_file(name, cur->dbmxp);
		sprintf(name, "dmxp%d", idx);
		matrix_write_file(name, cur->dmxp);
	}
}

/* Return nnp->num_labels. */
unsigned int nn_num_labels(struct nn_s *nnp)
{
	unsigned int labels = nnp->num_labels;

	return labels;
}

/*
 *  Return the last entry of the layer-structure array, i.e. the
 *  output layer, or NULL when the array is empty.
 *
 *  The empty case is checked explicitly because the unsigned index
 *  (n - 1) wraps around when n == 0 and would read far outside lpp[].
 *  nn_num_lps() already logs a warning for a zero count.
 */
struct nn_layer_s *nn_last_lp(struct nn_s *nnp)
{
	unsigned int n;
	struct nn_layer_s **lpp;
	
	n = nn_num_lps(nnp);
	if ( n == 0 )
		return NULL;

	lpp = nn_get_lpp(nnp);

	return lpp[n-1];
}

struct matrix_dsc_s *nn_layer_mx_show(struct nn_s *nnp, unsigned int layer,  char *midp)
{
	unsigned int i, n, m;
	struct matrix_dsc_s *mxp;
	struct nn_layer_s **lpp, *lp;
	char *mxid[] = {"zmxp", "amxp", "wmxp",  "bmxp", 
	                               "dwmxp", "dbmxp", "dmxp"};

	m = nn_num_lps(nnp);
	
	if ( layer >= m ) {
		sprintf(dbg, "Error layer # %d exceeding max layer #%d", 
			layer, m); DBG(dbg); 
		DBG("Nothing was done.");

		return NULL;
	}

	lpp = nn_get_lpp(nnp);
	lp = lpp[layer];
	
	n = ARRAYSIZE(mxid);
	i = 0;
	if ( !strcmp(midp, mxid[i++]) ) {
		mxp = lp->zmxp;
	} else if ( !strcmp(midp, mxid[i++]) ) {
		mxp = lp->amxp;
	} else if ( !strcmp(midp, mxid[i++]) ) {
		mxp = lp->wmxp;
	} else if ( !strcmp(midp, mxid[i++]) ) {
		mxp = lp->bmxp;
	} else if ( !strcmp(midp, mxid[i++]) ) {
		mxp = lp->dwmxp;
	} else if ( !strcmp(midp, mxid[i++]) ) {
		mxp = lp->dbmxp;
	} else if ( !strcmp(midp, mxid[i++]) ) {
		mxp = lp->dmxp;
	} else {
		mxp = NULL;
	} 

	if ( mxp ) {
		matrix_print(mxp);	
	} else {
		sprintf(dbg, "matrix id was incorrect = %s", midp); DBG(dbg);
		DBG("must be one of \"zmxp\", \"amxp\", \"wmxp\", \"bmxp\", \"dwmxp\", \"dbmxp\", \"dmxp\"");
	}

	return mxp;
}

/*
 * -------- Compute the cost of one full NN pass ( FF + BP) -------
 *  
 */
double  nn_cost(struct nn_s *nnp)
{
	unsigned int i, j, m, n, nl, num_lps;
	struct matrix_dsc_s *tmxp, *amxp;
	struct nn_layer_s *lp, **lpp;
	double **arpp, *arp, **trpp, *trp, cost, e, lambda, reg_cost; 

	lp = nn_last_lp(nnp);
	amxp = lp->amxp;    // This is the computed result.
	tmxp = nnp->tmxp;   
	
	if ( !MXSIZE_EQ(amxp, tmxp) ) {
		sprintf(dbg, "Error: the dims of matrixes %s and %s are not equal.", 
			MXID(amxp), MXID(tmxp) ); DBG(dbg); 
		return -1.0;
	}
	
	m = MXROWS(amxp);    trpp = MXRPP(tmxp);
	n = MXCOLS(amxp);    arpp = MXRPP(amxp);
	
	cost = 0.;
	for ( i = 0; i < m; i++ ) {
		trp = trpp[i];
		arp = arpp[i];
		for ( j = 0; j < n; j++ ) {
			e = arp[j];
			if ( NN_USE_SIGMOID ) {
				// use sigmoid(.)
				if ( trp[j] == 1.0 ) {
					cost += log(e); 
				} else {
					cost += log(1.0 - e); 
				}
			
			} else {  
				// use tanh(.)
				if ( trp[j] == 1.0 ) {
					cost += log(1.0 + e) - log(2.0); 
				} else {
					cost += log(1.0 - e) - log(2.0); 
				}
			}
		}
	}
	
	
	reg_cost = nn_l2_reg(nnp);

	nnp->cost = -cost/(double)m + lambda * reg_cost /(double)(2*m);
	
	return nnp->cost;
}

/*
 *  Return the sum, over all layer structures, of
 *  matrix_ewop_sqrtsum() applied to the layer's weight matrix.
 *  (The exact quantity computed per matrix is defined by
 *  matrix_ewop_sqrtsum(), which is not part of this section.)
 */
double nn_l2_reg(struct nn_s *nnp)
{
	unsigned int idx, count;
	struct nn_layer_s **layers;
	double total = 0.0;

	layers = nn_get_lpp(nnp);
	count  = nn_num_lps(nnp);

	for ( idx = 0; idx < count; idx++ )
		total += matrix_ewop_sqrtsum(layers[idx]->wmxp);

	return total;
}

/*--- backup copy of the original nn_cost() --- */
double  nn_cost2(struct nn_s *nnp)
{
	unsigned int i, j, m, n, nl, num_lps;
	struct matrix_dsc_s *tmxp, *ymxp;
	struct nn_layer_s *lp, **lpp;
	double **yrpp, *yrp, **trpp, *trp, cost, e; 

	lp = nn_last_lp(nnp);
	ymxp = lp->amxp;
	tmxp = nnp->tmxp;
	
	if ( !MXSIZE_EQ(ymxp, tmxp) ) {
		sprintf(dbg, "Error: the dims of matrixes %s and %s are not equal.", 
			MXID(ymxp), MXID(tmxp) ); DBG(dbg); 
		return -1.0;
	}
	
	m = MXROWS(ymxp);    trpp = MXRPP(tmxp);
	n = MXCOLS(ymxp);    yrpp = MXRPP(ymxp);
	
	cost = 0.;
	for ( i = 0; i < m; i++ ) {
		trp = trpp[i];
		yrp = yrpp[i];
		for ( j = 0; j < n; j++ ) {
			e = yrp[j];
			if ( trp[j] == 1.0 ) {
				cost += log(e); 
			} else {
				cost += log(1.0 - e); 
			}
		}
	}

	nnp->cost = -cost/(double)m;
	
	return nnp->cost;
}

/* 
 *  Set up the initial search direction as the negative of the
 *  gradient vector:  dkp[i] = -gkp[i]  for all nlnopt_num_dims()
 *  entries, and return the sum of the squared gradient entries.
 *
 *  NOTE(review): the returned value is +sum(gkp[i]^2), whereas the
 *  slope along dkp (dkp . gkp, as computed in nn_nlnopt_init()) is
 *  the negative of that -- confirm which sign the callers expect.
 */
double  nlnopt_dk0(struct nlnopt_s *nlp)
{
	unsigned int idx, dims;
	double *dir, *grad, sq_sum;

	dims = nlnopt_num_dims(nlp); 
	dir  = nlp->dkp;   
	grad = nlp->gkp;   

	sq_sum = 0;
	idx = 0;
	while ( idx < dims ) {
		dir[idx] = -grad[idx];
		sq_sum  += grad[idx] * grad[idx];
		idx++;
	}

	return sq_sum;
}

// -----  NN goes thru the nonlinear optimization process  ---- //
//
//  The code for dynamic bracketing via quadratic or cubic interpolation 
//  /extrapolation for line search is based on Carl Rasmussen's
//  minimize.m  MATLAB code. The conjugate gradient descent (direction)
//  algorithm, which proves more effective, is from the paper
//  	
//  	A New Conjugate Gradient Method with Guaranteed 
//  		Descent and An Efficient Line Search 
//
//  			by William W. Hager and Hongchao Zhang
//
//				2005  SIAM J. of Optimization
//				Vol. 16,  No. 1 pp 170-192
//
//  Structure of the routine:
//    - the outer loop (i) runs a fixed m = 200 times; each pass is
//      one line search along the current direction dkp;
//    - the middle loop (k) allows up to 45 rounds per line search;
//    - the inner loop (j) does up to n = 20 interpolation steps.
//  Variables: z1 is the accumulated step, f1/s1 the cost and slope at
//  the start of the line search, f2/s2 at the current trial point,
//  f3/s3/z3 describe the previous trial point.
//
//  Note that nn_funcval() ADDS alpha * dkp to xkp on every call, so
//  the alpha passed to it is an increment relative to the previous
//  trial point, not an absolute step length.
//
void  nn_nlnopt2(struct nn_s *nnp)
{
	unsigned int i, j, k, l, m, n, ok;
	struct nlnopt_s *nlp = nnp->nlp;
	double  z1, z2, z3, f1, f2, f3, s1, s2, s3, 
			 sigma, rho, EXT, INT, limit;
	
	m = 200, n = 20;   // outer iteration count, inner interpolation cap
	
	// 1. Compute the function value at the initial point xkp and its 
	// gradient (see nn_nlnopt_init()): f1 is the function value, the
	// gradient is left in gkp, the search direction dkp = -gkp and
	// s1 is the slope along dkp.
	//
	
	// line-search parameters; EXT is read but not used in this routine
	rho = nlp->rho;  sigma = nlp->sigma;
	EXT = nlp->EXT;    INT = nlp->INT;
	
	f1 = nn_nlnopt_init(nnp, &s1);

	z1 = 1. / (1. - s1);   // initial step length
	z3 = -z1;

	for ( i = 0; i < m; i++ ) {
		
		// remember the starting point of this line search as point 3
		f3 = f1,  s3 = s1,  z3 = -z1;
		f2 = nn_funcval(nnp, z1, &s2);
		ok = 0, limit = -1;

		for ( k = 0; k < 45; k++ ) {
			for ( j = 0; j < n; j++ ) {
				// Interpolate while the cost has not decreased enough
				// (rho test) or the slope is too positive (sigma test).
				if ( ( f2 > (f1 + z1 * rho * s1) || ( s2 > (-sigma * s1) ) ) ) {
					limit = z1;
					z2 = nlnopt_alpha(f1, f2, f3, s2, s3, z3);

					// keep the new step within [INT, 1-INT] of z3
					z2 = max(min(z2, INT * z3), (1-INT) * z3);

					z1 += z2;
				
					f2 = nn_funcval(nnp, z2, &s2);
					z3 -= z2;
				} else if ( s2 > sigma * s1 ) {
					// both tests passed: the line search succeeded
					ok = 1;
					break;
				}
				// NOTE(review): when neither branch applies, nothing
				// changes and the loop just counts j up to n.
			}
		
			if ( !ok ) {  // ---- Try one more time ----
		
				// extrapolate, then make the current point the new point 3
				z2 = nlnopt_cubic_ext(z1, z3, f2, f3, s2, s3, limit);
				f3 = f2, s3 = s2, z3 = -z2;
				z1 += z2;

				f2 = nn_funcval(nnp, z2, &s2);
			} else {
			
				// new search direction; s2 becomes the slope along it
				s2 = nn_nlnopt_nextiter(nlp);
				z1 *= min(100, s1/s2);   // slope ratio, capped at 100
				f1 = f2, s1 = s2;
				break;	
			}
		}
		
		if ( !ok ) {
			DBG("xxxxxxxx line search failed xxxxxxxxx");
		}
	}
	
}


/*
 *   Prepare the state of the nonlinear optimization before its main
 *   loop is entered.  The initial data point, pointed to by xkp,
 *   must already be set.  This routine
 *
 *   1)  runs the cost function once with alpha = 0, which leaves the
 *       initial gradient vector in gkp;
 *   2)  copies xkp to xk1p;
 *   3)  copies gkp to gk1p;
 *   4)  stores -gkp in dkp, i.e. the initial search direction is the
 *       steepest descent direction;
 *   5)  stores -0.5 * gkp in ykp;
 *   6)  computes the slope along dkp, the inner product dkp . gkp.
 *
 *   The slope is returned through *s, the initial cost is the return
 *   value.
 *
 *   Note: in the nonlinear optimization xkp and gkp are the current
 *    data point and its gradient vector while xk1p and gk1p are the
 *    previous data point and its gradient vector.
 *
 */

double   nn_nlnopt_init(struct nn_s *nnp, double *s)
{
	unsigned int idx, dims;
	struct nlnopt_s *nlp = nnp->nlp;
	double cost, slope, g;

	cost = nn_funcval(nnp, 0, &slope);  // value of cost function
	dims = nlnopt_num_dims(nlp);

	// the slope reported by nn_funcval() is discarded and recomputed
	slope = 0.0;
	for ( idx = 0; idx < dims; idx++ ) {
		g = nlp->gkp[idx];

		nlp->xk1p[idx] = nlp->xkp[idx];
		nlp->gk1p[idx] = g;
		nlp->dkp[idx]  = -g;
		nlp->ykp[idx]  = -0.5 * g;
		slope += nlp->dkp[idx] * g;
	}

	*s = slope;
	
	return cost;
}

/*
 *  Evaluate the objective function after moving the current point:
 *
 *    1) nnp->num_calls is incremented;
 *    2) xkp += alpha * dkp   (the move is cumulative, xkp is modified
 *       in place);
 *    3) a feed-forward and a backpropagation pass are run and the
 *       cost is computed with nn_cost();
 *    4) the slope, the inner product of gkp and dkp, is stored in *s.
 *
 *  Returns the cost.
 */
double nn_funcval(struct nn_s *nnp,  double alpha, double *s)
{
	unsigned int idx, dims;
	double cost, *point, *grad, *dir;
	struct nlnopt_s *nlp = nnp->nlp;

	nnp->num_calls++;

	dims  = nn_vector_dim(nnp); 
	point = nlp->xkp;   // the point
	grad  = nlp->gkp;   // the gradient
	dir   = nlp->dkp;   // the search direction

	for ( idx = 0; idx < dims; idx++ )
		point[idx] += alpha * dir[idx];

	nn_ff_layers(nnp);
	nn_bp_layers(nnp);
	cost = nn_cost(nnp);

	*s = vectors_inner_prod(dims, grad, dir);

	return cost;
}



/**------ Compute beta (various algorithms), new search direction dkp
 * and set the new X and its partial derivatives / gradient -----**/

/*
 *  In the NN process, xkp/gkp stores the computed optimal point and
 *  its corresponding gradient vector while  xk1p/gk1p stores the 
 *  previous point and the previous point's gradient vector.  
 *
 *  That is,  
 *   --- previous ----- current ----
 *      x0 = xk1p  and  xk = xkp
 *      g0 = gk1p  and  gk = gkp
 *
 *  Returns the slope (inner product of the new dkp and gkp) along
 *  the new search direction.  The process is terminated with
 *  exit(1) when beta is infinite.
 *
 *  Naming in the trailing comments below: "gk1" is the CURRENT
 *  gradient (array gkp) and "gk" is the PREVIOUS gradient (array
 *  gk1p); the array names and the comment names are crossed.
 */
double nn_nlnopt_nextiter(struct nlnopt_s *nlp)
{
	unsigned int i, n , v;
	double  beta, b_pr, b_hs, b_fr, b_ls, b_dy, b_hdy, b_hz, hz_s, s,
		*ykp, *dkp, *xkp, *xk1p, *gkp, *gk1p,
		ip_yk, ip_dkyk, ip_gk, ip_dkgk, ip_gk1, ip_gk1yk;
	
	v = 0;   // verbose printing: 1 = yes; 0 = no
	n = nlnopt_num_dims(nlp);
	xkp = nlp->xkp;     xk1p = nlp->xk1p;
	gkp = nlp->gkp;     gk1p = nlp->gk1p;
	ykp = nlp->ykp;     dkp  = nlp->dkp;

	for ( i = 0; i < n; i++ ) {    //          --- current - previous ---
		ykp[i] = gkp[i] - gk1p[i]; // By definition, yk = gk1 - gk0
	}
	
	ip_gk1yk = vectors_inner_prod(n, gkp,  ykp);  // gk1.T * yk
	ip_dkyk  = vectors_inner_prod(n, dkp,  ykp);  // dk.T  * yk
	ip_dkgk  = vectors_inner_prod(n, dkp,  gk1p); // gk.T  * dk
	
	ip_yk    = vectors_inner_prod(n, ykp,  ykp);  // yk.T  * yk
	ip_gk1   = vectors_inner_prod(n, gkp,  gkp);  // gk1.T * gk1
	ip_gk    = vectors_inner_prod(n, gk1p, gk1p); // gk.T  * gk

	// Candidate beta values; only the one selected further down is used.
	b_hs = ip_gk1yk/ip_dkyk;       // Hestenes and Stiefel
	b_fr = ip_gk1/ip_gk;           // Fletcher and Reeve
	b_pr = ip_gk1yk/ip_gk;         // Polak and Ribiere
	b_ls =-ip_gk1yk/ip_dkgk;       // Liu and Storey
	b_dy = ip_gk1/ip_dkyk;         // Dai and Yuan
	
	hz_s = -2 * ip_yk / ip_dkyk;   // Hager and Zhang

	// presumably ykp += hz_s * dkp -- confirm against vector_add_scaled()
	vector_add_scaled(n, ykp, dkp, hz_s);  // Note this will destroy yk.
	b_hz = vectors_inner_prod(n, ykp, gkp)/ip_dkyk;
	
	b_hdy = ip_gk / (max(ip_dkyk, -ip_dkgk) );

	// Compile-time selection of the beta formula: b_hz is the active one.
	if ( 0 ) {
		beta = b_pr;
	} else if ( 1 ) {
		beta = b_hz;     // seems to be the best 
	} else if ( 0 ) {
		beta = b_hdy;
	} else {
		beta = b_dy;
	}
	

	// Only infinity is caught here; a NaN beta is not.
	if ( isinf(beta) ) {
		sprintf(dbg, "isinf() = %g", beta); DBG(dbg); 
		exit(1);
	}

	for ( i = 0; i < n; i++ ) {
		dkp[i] = beta * dkp[i] - gkp[i];  // this is the new search direction.
	}
	
	// ---- the slope at the new 'optimal' point ----
	s = vectors_inner_prod(n, dkp, gkp);
	
	if ( s > 0 ) {  // If this slope is positive, then we have
		s = 0;      // to fall back to the steepest descent. 
		// NOTE(review): this branch also flips the sign of the stored
		// gradient gkp, and gk1p is then set to -0.5 times the already
		// flipped value -- confirm that this is intended.
		for ( i = 0; i < n; i++ ) {
			s += gkp[i] * gkp[i];
			dkp[i] = -gkp[i];
			gkp[i] = -gkp[i];
			xk1p[i] = xkp[i];   // save xkp to xk1p as the new starting point
			gk1p[i] = -0.5 *gkp[i];
		}
		s = -s;     // slope along -gkp is minus the squared norm
	
	} else {

		for ( i = 0; i < n; i++ ) {
			xk1p[i] = xkp[i];   // save xkp to xk1p as the new starting point
			gk1p[i] = gkp[i];
		}
	}
	
	return s;
}



// ---
// --- Save the current state as the previous state:
// ---   xk1p receives a copy of the data in vector xkp,
// ---   gk1p receives the NEGATED gradient, -gkp.
// ---
void nn_nlnopt_x2x1(struct nn_s *nnp)
{
	struct nlnopt_s *nlp = nnp->nlp;
	unsigned int idx, dims = nlp->num_dims;
	double *cur_x = nlp->xkp,  *cur_g = nlp->gkp;
	double *old_x = nlp->xk1p, *old_g = nlp->gk1p;

	for ( idx = 0; idx < dims; idx++ ) {
		old_x[idx] = cur_x[idx];
		old_g[idx] = -cur_g[idx];
	}
}


// ---
// --- Restore the current state from the saved one:
// ---   xkp receives a copy of xk1p, gkp a copy of gk1p.
// ---
void nn_nlnopt_x1x2(struct nn_s *nnp)
{
	struct nlnopt_s *nlp = nnp->nlp;
	unsigned int idx, dims = nlp->num_dims;
	double *cur_x = nlp->xkp,  *cur_g = nlp->gkp;
	double *old_x = nlp->xk1p, *old_g = nlp->gk1p;

	for ( idx = 0; idx < dims; idx++ ) {
		cur_x[idx] = old_x[idx];
		cur_g[idx] = old_g[idx];
	}
}



/*
 *  Gradient check: use the (symmetric) finite difference method to
 *  compute approximate partial derivatives and compare them with the
 *  gradients obtained by the NN BP process.
 */
void nn_check_grad(struct nn_s *nnp)
{
	// deterministic inputs, targets and weights
	nn_dbg_init_Xy(nnp);
	nn_dbg_init_weights(nnp);

	// show and dump what is about to be checked
	nn_netinfo_show(nnp);
	nn_dbg_dump_model(nnp);

	// numerical gradient first ...
	nn_finite_num_diff(nnp);

	// ... then the BP gradient at the same point
	nn_ff_layers(nnp);
	nn_bp_layers(nnp);

	// report how far the two are apart
	nn_grad_relative_diff(nnp);
}

/** ---------- Dump the model for debugging --------- 
 *
 *  For every layer i the weight matrix is written through
 *  matrix_write_file() under the name "w<i>dbg" and the bias matrix
 *  under the name "b<i>dbg".
 **/
void nn_dbg_dump_model(struct nn_s *nnp)
{
	unsigned int idx, count;
	char name[BUF_LEN];
	struct nn_layer_s **layers, *cur;

	count  = nn_num_lps(nnp);
	layers = nn_get_lpp(nnp);

	for ( idx = 0; idx < count; idx++ ) {
		cur = layers[idx];

		sprintf(name,  "w%ddbg", idx);
		matrix_write_file(name, cur->wmxp);

		sprintf(name,  "b%ddbg", idx);
		matrix_write_file(name, cur->bmxp);
	}
}

void nn_dbg_init_Xy(struct nn_s *nnp)
{
	unsigned int i, n, bs, num_ins, num_labels; 
	struct nn_layer_s **lpp, *lp;
	struct matrix_op_s *mop = MOP(nnp);

	//nnp->num_labels = num_labels = 3;
	nnp->num_labels = num_labels = nn_num_labels(nnp);
	n = nn_num_lps(nnp);
	num_ins = nnp->unit_layer_lp[0];
	bs = nn_batch_size(nnp);
	
	nnp->Xmxp = matrix_create2(mop, "X", bs, num_ins);
	nnp->tmxp = matrix_create2(mop, "t", bs, num_labels);
	
	nn_dbg_init_mxp(nnp->Xmxp);
	matrix_print(nnp->Xmxp);

	nn_dbg_init_y(nnp);
	matrix_print(nnp->tmxp);

	lpp = nn_get_lpp(nnp);
	lp = lpp[0];
	lp->zmxp = nnp->Xmxp;
	
	if ( 0 ) {
		matrix_write_file("XX", nnp->Xmxp);
		matrix_write_file("yy", nnp->tmxp);
	}
}

/** Compute the approximate partial derivative by  numerical finite difference 
 *
 *  For every entry w = xkp[i] of the weight/bias vector:
 *
 *      gk1p[i] = ( cost(w + e) - cost(w - e) ) / (2 e),   e = 1e-4
 *      ykp[i]  = ( cost(w + e) + cost(w - e) ) / 2
 *
 *  where each cost is obtained by a feed-forward pass followed by
 *  nn_cost().
 *
 *  The original value of the entry is saved and written back
 *  verbatim.  Stepping with "-= e, += 2e, -= e" instead does not
 *  restore it exactly in floating point, so every entry would drift
 *  a little and the later entries would be differentiated at a
 *  slightly different point than the BP gradient is computed at.
 **/
void  nn_finite_num_diff(struct nn_s *nnp)
{
	unsigned int i, n;
	struct nlnopt_s *nlp = nnp->nlp;
	double *xkp, *gk1p, *ykp, e, w0, loss1, loss2;

	e = 0.0001;   // epsilon 1e-4
	
	xkp = nlp->xkp;          ykp = nlp->ykp;
	gk1p = nlp->gk1p;

	n = nn_vector_dim(nnp);
	for ( i = 0; i < n; i++ ) {
		w0 = xkp[i];

		xkp[i] = w0 - e;
		nn_ff_layers(nnp);
		loss1 = nn_cost(nnp);
		
		xkp[i] = w0 + e;
		nn_ff_layers(nnp);
		loss2 = nn_cost(nnp);
		
		xkp[i] = w0;           // restore the entry exactly

		gk1p[i] = (loss2 - loss1) / (2 * e); 

		// ---- temporarily store the cost -----
		ykp[i] = ( loss1 + loss2 )/2.0;
	}	
}

/** ----- Compare relative difference so that we can measure the accuracy of BP gradients  ------
 *
 *  gk1p holds the finite-difference gradient, gkp the BP gradient
 *  and ykp the cost stored by nn_finite_num_diff().  One line is
 *  printed per entry, followed by the overall relative difference
 *
 *        sqrt( sum (g2 - g1)^2 / sum (g2 + g1)^2 )
 *
 *  with g1 = gkp[i] and g2 = gk1p[i].
 **/
void nn_grad_relative_diff(struct nn_s *nnp)
{
	const unsigned int verbose = 1;
	unsigned int idx, dims;
	struct nlnopt_s *nlp = nnp->nlp;
	double *bp_grad, *fd_grad, *costs,
		bp, fd, minus, plus, num, den, rel_diff;

	num = den = 0;
	dims    = nn_vector_dim(nnp);
	bp_grad = nlp->gkp;
	fd_grad = nlp->gk1p;
	costs   = nlp->ykp;

	if ( verbose ) {
		DBG("---finite------- BP -------diff -----cost--"); 
		DBG("-diff apprx-- gradient ----- %  -----------"); 
	}

	for ( idx = 0; idx < dims; idx++ ) {
		bp = bp_grad[idx];
		fd = fd_grad[idx];

		if ( verbose ) {
			sprintf(dbg, "%' '-10.5e  %' '-10.5e  %' 'e %' '-10.5e", 
				fd, bp, 1-fd/bp, costs[idx]);  DBG(dbg); 
		}

		minus = fd - bp;
		plus  = fd + bp; 

		num += minus * minus;
		den += plus * plus;
	}

	rel_diff = sqrt(num/den);

	sprintf(dbg, " The relative difference is %8.3g", rel_diff);
	DBG(dbg); 
}

/** NN check gradients: fill the weight/bias vector xkp with the
 *  deterministic values  xkp[i] = 0.1 * sin(i)  so that the gradient
 *  check is reproducible. **/
void nn_dbg_init_weights(struct nn_s *nnp)
{
	unsigned int idx, dims;
	double *weights = nnp->nlp->xkp;

	dims = nn_vector_dim(nnp);

	idx = 0;
	while ( idx < dims ) {
		weights[idx] = sin( (double) idx ) *  0.1; 
		idx++;
	}
}

/** 
 *  Init the matrix with a deterministic function so as to make it
 *  easier to debug: the element at row i, column j is set to
 *  0.1 * sin(k), where k is the element's running index in
 *  row-major order (k = i * columns + j).
 *
 **/
void nn_dbg_init_mxp(struct matrix_dsc_s *mxp)
{
	unsigned int row, col, num_rows, num_cols;
	double **rows, *cur;

	rows     = MXRPP(mxp);
	num_rows = MXROWS(mxp);
	num_cols = MXCOLS(mxp);

	for ( row = 0; row < num_rows; row++ ) {
		cur = rows[row];
		for ( col = 0; col < num_cols; col++ ) {
			cur[col] = sin((double)(row * num_cols + col)) * 0.1;
		}
	}
}

/*
 *  Fill the debug target matrix nnp->tmxp: in row i, the column
 *  (1 + i) % span is set to 1, where span is 3, or the column count
 *  of the matrix when that is smaller than 3.
 *
 *  The modulus used to be a hard-coded 3, which wrote outside of the
 *  row when the matrix had fewer than three columns.  For matrices
 *  with three or more columns the result is unchanged.
 *
 *  All other elements are left as they are -- presumably the matrix
 *  comes zero-filled from matrix_create2(); confirm.
 */
void nn_dbg_init_y(struct nn_s *nnp)
{
	unsigned int i, j, m, n, t, span;
	double **rpp, *rp;
	struct matrix_dsc_s *tmxp;

	tmxp = nnp->tmxp;
	m = MXROWS(tmxp);  rpp = MXRPP(tmxp);
	n = MXCOLS(tmxp);

	if ( n == 0 )
		return;            // no column to mark

	span = ( n < 3 ) ? n : 3;
	
	t = 1;
	for ( i = 0; i < m; i++ ) {
		rp = rpp[i];
		j = ( (t + i ) % span ); 
		rp[j] =  1;
	}

}



