/* -- rcmreg.c 
 Calculates the regression coefficient estimates in a (possibly multivariate) 
 multiple linear regression model using the lift rank covariance matrix. 
 
 The method is described in:

  E. Ollila, H. Oja and V. Koivunen : "Estimates of regression 
  coefficients based on lift rank covariance matrix," Journal 
  of the American Statistical Association, vol. 98, no. 461, 
  pp. 90 -- 98, 2003. 

 author:  Esa Ollila, esollila@wooster.hut.fi
          http://cc.oulu.fi/~esollila
 
 For some documentation and examples please visit my web-page.

 This C-code is a public domain, altough please leave my name, 
 email address as well as the web address attached. If you use 
 this program or any subroutines, please cite the above mentioned 
 refrence  
*/

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <string.h>
#include <ctype.h>
#include "rcmreg.h"

#define AUTHOR Esa Ollila
#define EMAIL esollila@wooster.hut.fi
#define WWW "http://cc.oulu.fi/~esollila/"
#define TRUE 1
#define FALSE 0
#define MAX 128 /* maxlength of the character string */

int main(int argc, char* argv[])
{	                          
        int k,n,nry,nrboot,boot=0,nrx,i,j;
	double  **data,**score,**B,**cov,**err;      	
	unsigned long arg;
        FILE *fp;       
        char  infile[MAX];

	if (argc==1) {
	   Get_Input("The name of the data file    : ",infile,STRING);
	   Get_Input("Number of observations       : ",&n,POS_INT);
	   do {
	      Get_Input("Number of variables          : ",&k,POS_INT);
	      if (n<=k) printf("\n***> Not enough cases <***\n\n");
	   } while (n<=k);
	   do { 
	      Get_Input("Number of response variables : ",&nry,POS_INT); 
	      if (nry>=k) printf("\n***> Too many (must be less than %d)<***\n\n",k);          
	   } while (nry>=k);	  
	       
	   printf("\nNOTE: The response variable(s) are expected to be the"); 
	   printf("\n      last column(s) in the data set.\n");
         
	   printf("\nDo you wish to estimate the standard errors by the bootstrap ?\n");
	   Get_Input("Please answer Yes or No : ",&boot,YES_NO);     

	   if (boot) {
	      Get_Input("The number of bootstrap samples : ",&nrboot,POS_INT);	  	
	   }

	} else if (argc == 9 || argc == 11) {
	      
	   if ((arg = StringExists("-f",argc,argv))>0) strcpy(infile,argv[arg+1]);
	   else ERROR(flag -f name_of_file missing); 
	   if ((arg = StringExists("-n",argc,argv))>0) n = atoi(argv[arg+1]);
	   else ERROR(flag -n nr_of_observations missing);
	   if ((arg = StringExists("-k",argc,argv))>0) k = atoi(argv[arg+1]);
	   else ERROR(flag -k nr_of_variables missing);
	   if ((arg = StringExists("-y",argc,argv))>0) nry = atoi(argv[arg+1]);
	   else ERROR(flag -y nr_of_responses missing);
	   if (argc==11) {
	      if ((arg = StringExists("-b",argc,argv))>0) {
	          boot  = 1;
	          nrboot = atoi(argv[arg+1]);
	      } else ERROR(flag -b nr_of_boot_samples is missing);
	   }

	   if (k<2) ERROR(Too few variables. Atleast too is needed);     
	   if (n<=k) ERROR(Not enough cases);
	   if (nry>k) ERROR(Number of reponse variables exceeds the total number of variables);  

	} else ERROR(invalid argument list);

	fp  = OpenFile(infile,"r");

	data  = dmatrix(1,n,1,k);   
        score = dmatrix(1,n,1,k+1);  

        for (i=1;i<=n;i++) { 
	    for (j=1;j<=k;j++) { 
	        if (fscanf(fp,"%lf",&data[i][j])==EOF && i<=n) {	  
	           printf("\nERROR: not enough cases in the data file!\n\n"); 	    	
		   exit(-1);
		}
	    }
	}

	fclose(fp);       

	nrx    = k-nry;
	B      = dmatrix(1,nrx+1,1,nry);  /* The RCM regr. coefficients */
	cov    = dmatrix(1,k+1,1,k+1);    /* Contains the rank covariance matrix */	
	err    = dmatrix(1,nrx+1,1,nry);  /* The Bootstrap std. errors of RCM estimates */ 

	printf("\n - now calculating regression estimates...\n");
	OjaRanks(data,score,n,k);    	
	RCM(score,cov,n,k);
	RCMReg(B,cov,n,k,nry);   	
	if (boot) { 
	    printf(" - now bootstrapping...\n");
	    Bootstrap(err,B,data,n,k,nry,nrboot);	 
	}
	
	Print_Results(B,cov,err,score,n,k,nry,boot);  
            
	free_dmatrix(cov,1,k+1,1,k+1);
	free_dmatrix(err,1,nrx+1,1,nry);
	free_dmatrix(B,1,nrx+1,1,nry);
	free_dmatrix(data,1,n,1,k);
	free_dmatrix(score,1,n,1,k);
	return 0;
}

void OjaRanks(double **data,double **score,int n,int k)
{   
        int i,j,l,k1,*p,*indx,sp;   
	unsigned long cnt;	
	double d,d0,d1,d2,**dmat,*dp,*h;      

	for (i=1;i<=n;i++) 
	    for (j=1;j<=k+1;j++)
		score[i][j] = 0;      

	if (k==2) { 
                   
	    for (i=1;i<=n-1;i++) {
	        for (j=i+1;j<=n;j++) {	     	      
	            d0 = data[j][2]*data[i][1]-data[i][2]*data[j][1];
		    d1 = -(data[j][2]-data[i][2]);
		    d2 =  data[j][1]-data[i][1];
		    for (l=1;l<=n;l++) { 
		        d = d0+d1*data[l][1]+d2*data[l][2];
			if (fabs(d)>SMALL) {	       
			   sp = SIGN(d);		    		  
			   score[l][1] += sp*d0;
			   score[l][2] += sp*d1;	           
			   score[l][3] += sp*d2;
			}
		    }
		}		  
	    }

	    sp = n*(n-1)/2;	       
	    for (i=1;i<=n;i++) {
	        score[i][1] = score[i][1]/sp;	
	        score[i][2] = score[i][2]/sp;	
		score[i][3] = score[i][3]/sp;	
	    }

	} else { 	   

	   p = ivector(1,k);  /* indices*/       
	   h = dvector(1,k);  /* dummy */
	   indx = ivector(1,k+1); /* for ludcmp and lubksb */ 
	   dmat = dmatrix(1,k+1,1,k+1); /* dummy */
	   dp = dvector(1,k+1); /* cofactors */

	   i=0;
	   do {  
	      d  = (double)rand();
	      if (d!=0)  h[++i] = d/RAND_MAX;
	   } while (i<k);
       	
	   for (j=1;j<=k;j++) p[j] = j; 

	   cnt= 0;	
	   k1=k+1;
	   do {
	      cnt += 1;
	      for (i=1;i<=k+1;i++) dmat[i][1] = 1;
	      for (i=1;i<=k;i++) 
	          for (j=1;j<=k;j++) 
		      dmat[j][i+1] = data[p[j]][i];
	      for (i=1;i<=k;i++) dmat[k+1][i+1] = h[i];	  
	      j = ludcmp(dmat,k+1,indx,&d); 	     
	      for (i=1;i<=k+1;i++) d *= dmat[i][i]; 	
	      if (j==0 && fabs(d)>1.0e-9) {
		 for (i=1,dp[k+1]=1;i<=k;i++) dp[i]=0;	
		 lubksb(dmat,k+1,indx,dp);	
		 for (i=1;i<=k+1;i++) dp[i] *= d;	  
		 for (i=1;i<=n;i++) {	       
	             for (j=1;j<=k;j++) {
		         if (i==p[j]) break;
		     }	   
		     if (j==k1) { 		 
		        for (j=1,d=dp[1];j<=k;j++) d += data[i][j]*dp[j+1];		
			if (fabs(d)>SMALL) {	       
		           sp = SIGN(d);		    		  
			   for (j=1;j<=k1;j++) score[i][j] += sp*dp[j];	           
			}
		     }
		 }
	      }
	      Nextp(n,k,p,&j);
	      if (!j) break;	    
	   } while(1);

	   for (i=1;i<=n;i++) 
	       for (j=1;j<=k+1;j++) 
		   score[i][j] = score[i][j]/cnt;	

	   free_ivector(p,1,k);
	   free_ivector(indx,1,k+1);
	   free_dvector(h,1,k);       
	   free_dvector(dp,1,k+1);       
	   free_dmatrix(dmat,1,k+1,1,k+1);
	}
}	

void RCM(double **score,double **cov,int n, int k)
{
        int i,j,l;

	for (i=1;i<=k+1;i++) {
            for (j=i;j<=k+1;j++) {
	        cov[i][j]=0;
                for (l=1;l<=n;l++) cov[i][j] += score[l][i]*score[l][j];                
                cov[i][j] /= n;
		cov[j][i] = cov[i][j];
            }
	}      
}
      
void RCMReg(double **B,double **cov,int n,int k,int nry)
{
       double **C22,*col,d;
       int i,j,l,nrb,*indx;

       if (nry==1) {	    
            for (i=1;i<=k;i++) B[i][1] = -cov[i][k+1]/cov[k+1][k+1];   
       } else {

	    nrb  = k+1-nry;
	    C22  = dmatrix(1,nry,1,nry);
	    col  = dvector(1,nry);
	    indx = ivector(1,nry); 

	    for (i=nrb+1;i<=k+1;i++) 
	        for (j=nrb+1;j<=k+1;j++)
		    C22[i-nrb][j-nrb] = cov[i][j];	    
	    j=ludcmp(C22,nry,indx,&d);
	    for (i=1;i<=nry;i++) d *= C22[i][i]; 	    
	    if (j==-1 || fabs(d)<1.0e-9) {
	       printf("Singular matrix in RCMReg()."); exit(5);
	    }	   
	    for (j=1;j<=nrb;j++){
                for (i=1;i<=nry;i++) col[i]=cov[nrb+i][j];	      
		lubksb(C22,nry,indx,col);
		for(i=1;i<=nry;i++) B[j][i]= -col[i];
	    }
	    free_dvector(col,1,nry);
	    free_ivector(indx,1,nry); 
	    free_dmatrix(C22,1,nry,1,nry);
       }
}    
  
void Nextp(int n, int k, int *q, int *lc)
{
	int i;

	*lc = k;
	if (q[k]<n) {
	   q[k] +=1;
	} else {
		do {
		    *lc = *lc-1;
		    if ( q[*lc]< (n-k+(*lc))) break;
		} while ( (*lc) !=0 );

		if ((*lc)>0) {
		    q[(*lc)] += 1;
		    for (i=(*lc)+1;i<=k;i++) q[i] = q[i-1]+1;
		}
	}
}

void Set_Seed(long *idum)
/* pick up a random nonpositive seed, that is not equal to zero */ 
{ 
      long s;
      s = (long)time(NULL);     
      s =  labs(s);  
      if (s==0) *idum=-222333; /* some value, that is not zero */
      else *idum = -s;          
}

void Bootstrap(double **err,double **B,double **data,int n,int k,int nry,int nrboot)
{
       double **beta,**score,**data2,**cov,numb,pr;
       int i,j,l,cnt,nrb,nrx,plkm,*pnts;
       long idum;
       
       nrb = k-nry+1; 
       nrx = k-nry;

       beta  = dmatrix(1,nrboot,1,nry*nrb);  /* matrix of bootstr. estimates */    
       score = dmatrix(1,n,1,k+1); /* bootstrap ranks */
       data2 = dmatrix(1,n,1,k); /* bootstrap ranks */
       cov   = dmatrix(1,k+1,1,k+1); /* RCM */
       pnts  = ivector(1,n); /* randomly chocen points */
       

       /* copy the data */
       for (i=1;i<=n;i++) 
	   for (j=1;j<=k;j++) 
	        data2[i][j] = data[i][j];

       pr = nrboot/10.0;
       plkm = 1;

       Set_Seed(&idum);     

       for (cnt=1;cnt<=nrboot;cnt++) {
 	    
	     i=0;
	     do { 
	         i++;
  	         numb = ran1(&idum);
	         pnts[i] = (int)(n*numb+1.0);
	     } while (i<n);

	     for (i=1;i<=n;i++)
	         for (j=1;j<=k;j++) 
		     data[i][j] = data2[pnts[i]][j];	   

	     OjaRanks(data,score,n,k);          
	     RCM(score,cov,n,k);
	     RCMReg(err,cov,n,k,nry); 
	   
	     for (i=1;i<=nry;i++) 
	         for (j=1;j<=nrb;j++) 
		     beta[cnt][(i-1)*nrb+j] = err[j][i];		 
	   
	     if (cnt>plkm*pr) { 
	      	printf("\t%3d percent of the calculations performed...\n",plkm*10);
		plkm++;
	     } 
       }

       for (i=1;i<=nry;i++) {
	   for (j=1;j<=nrb;j++) {	       
	       for (l=1,numb = 0;l<=nrboot;l++) numb +=  beta[l][(i-1)*nrb+j];	
	       numb /= nrboot;		     
	       for (l=1,pr=0;l<=nrboot;l++) pr += (beta[l][(i-1)*nrb+j]-numb)*(beta[l][(i-1)*nrb+j]-numb);
	       err[j][i] = pr/(nrboot-1);	              
	       err[j][i] = sqrt(err[j][i]);
	   }
       }      

       free_dmatrix(beta,1,nrboot,1,nry*nrb);
       free_dmatrix(cov,1,k+1,1,k+1);
       free_ivector(pnts,1,n);
       free_dmatrix(score,1,n,1,k+1); 
       free_dmatrix(data2,1,n,1,k); 
}

void Print_Results(double **B,double **cov,double **err,double **score,int n,int k,int nry,int boot)
{
       int i,j,nrx;
       double min,max;
       FILE *fp;

       nrx = k-nry;     
       fp = OpenFile("results.out","w");

       /* check out what precision would be nice for printing: if 
       exponent greater that 8 -> use %e else use %f */
       min = 1.0e200;
       max = -1.0e200;
       for (i=1;i<=n;i++) {
	   for (j=1;j<=k+1;j++) {
	       if (score[i][j]<min) min = score[i][j];
	       if (score[i][j]>max) max = score[i][j];
	   }
       }

       fprintf(fp,"\nOJA LIFT RANKS:");
       fprintf(fp,"\n***************\n\n");
       fprintf(fp,"%4s ","obs.");
       for (i=0;i<=k;i++) fprintf(fp,"%15s(%d) ","R",i);
       fprintf(fp,"\n");

       if (fabs(min)>1.0e8 || fabs(max)>1.0e8) { 
	  for (i=1;i<=n;i++) {
	      fprintf(fp,"%-4d ",i); 
	      for (j=1;j<=k+1;j++) {
		  fprintf(fp,"%18e ",score[i][j]);
	      }
	      fprintf(fp,"\n");
	  }
       } else { 
	  for (i=1;i<=n;i++) {
	      fprintf(fp,"%-4d ",i); 
	      for (j=1;j<=k+1;j++) {
		  fprintf(fp,"%18.5f ",score[i][j]);
	      }
	      fprintf(fp,"\n");
	  }
       }

       max = -1.0e200;
       for (i=1;i<=k;i++) 
	   for (j=1;j<=k;j++)  	   
	       if (cov[i][j]>max) max = cov[i][j];   

       fprintf(fp,"\nLIFT RANK COVARIANCE MATRIX:");
       fprintf(fp,"\n****************************\n\n");

       if (fabs(max)>1.0e9) { 
	  for (i=1;i<=k+1;i++) {
	      for (j=1;j<=k+1;j++) {
		  fprintf(fp,"%18e ",cov[i][j]);
	      }
	      fprintf(fp,"\n");
	  }
       } else { 
	 for (i=1;i<=k+1;i++) {
	      for (j=1;j<=k+1;j++) {
		  fprintf(fp,"%18.5f ",cov[i][j]);
	      }
	      fprintf(fp,"\n");
	  }
       }

       fprintf(fp,"\nRCM REGRESSION: \n***************\n");
       fprintf(stdout,"\nRCM REGRESSION: \n***************\n");

       Print_Coefs_and_Errors(fp,B,err,nry,k,boot);
       Print_Coefs_and_Errors(stdout,B,err,nry,k,boot);
             
       fprintf(stdout,"\nNOTE: More detailed output (e.g. lift ranks, lift RCM, regression \n"
	       "coefficients) are written to a file results.out\n\n");
}

void Print_Coefs_and_Errors(FILE *fp,double **B,double **err,int nry,int k,int boot)
{
       int i,j,nrx;
       nrx = k-nry;    

       fprintf(fp,"\ncoefficients:\n");  
       if (nry>1) {
	  fprintf(fp,"%12s"," ");
	  for (i=1;i<=nry;i++) fprintf(fp,"%14s%-2d ","y",i);
	  fprintf(fp,"\n");
	  for (i=1;i<=nrx+1;i++) {	 	 
	      if (i==1) fprintf(fp,"%12s","intercept");
	      else fprintf(fp,"%10s%-2d","x",i-1);
	      for (j=1;j<=nry;j++) {
	         fprintf(fp,"%16.5f ",B[i][j]);  
	      }
	      fprintf(fp,"\n");
	  }
       } else { 
	  fprintf(fp,"%11s : %16.5f\n","intercept",B[1][1]);
	  for (i=2;i<=k;i++)  fprintf(fp,"%10s%d : %16.5f\n","b",i-1,B[i][1]);   
       }

       if (boot) {
	  fprintf(fp,"\nstandard error:\n");
	  if (nry>1) {
	     fprintf(fp,"%13s","");
	     for (i=1;i<=nry;i++) fprintf(fp,"%14s%-2d ","y",i);
	     fprintf(fp,"\n");
	     for (i=1;i<=nrx+1;i++) {	 	 
	         if (i==1) fprintf(fp,"%13s","intercept");
		 else fprintf(fp,"%11s%-2d","x",i-1);
		 for (j=1;j<=nry;j++) {
	             fprintf(fp,"%16.5f ",err[i][j]);  
		 }
		 fprintf(fp,"\n");
	     }
	  } else { 
	     fprintf(fp,"%11s : %16.5f\n","intercept",err[1][1]);
	     for (i=2;i<=k;i++) fprintf(fp,"%10s%d : %16.5f\n","b",i-1,err[i][1]);   
	  }	  	
       }
}

FILE *OpenFile(char name[], char mode[])
{
	FILE *fp;

	fp= fopen(name,mode);
	if (fp==NULL){
	    perror(name);
	    fprintf(stderr,"..now exiting the system\n");
	    exit (2);
	}
	return fp;
}

unsigned long StringExists(char *s, int argc,char *argv[])
{
    int i;

    for (i=1;i<argc;i++) {
	if (!strcmp(argv[i], s)) return i;
    }
    return 0;
}

void Get_Input(char *prompt, void *val,enum Type type)
{
   char a[MAX];
   int i,j,l,ok;

   do {
      ok = TRUE;
      printf("%s", prompt);   
      scanf("%s",a);
      l = strlen(a);

      switch (type) { 

         case POS_INT : {	 
	  
	   if ( a[0]=='-') ok=FALSE;
	   else { 
	      if (a[0]=='+') j=1;
	      else j=0;
	      for (i=j;i<l;i++) {
		  if (!(a[i]>='0' && a[i]<='9')) {
		    ok=FALSE;		
		    break;
		  }
	      }
	   }	 
	   if (ok==TRUE) *(int *)val = atoi(a);	       	  
	 }     
         break;

         case YES_NO : { 	   
	   for (i=0;i<l;i++) a[i] = tolower(a[i]);	  	
           if (strcmp("yes",a)==0 || strcmp("y",a)==0) *(int *)val = 1;
           else if (strcmp("no",a)==0 || strcmp("n",a)==0) *(int *)val = 0;
	   else ok = FALSE;
	 }
	 break;

         case STRING  :
	   strcpy((char *)val,a);	 
	 break;
      }
      if (ok == FALSE) printf("\n***> Invalid Response <***\n\n");         
   } while (ok==FALSE);

}


