openModeller  Version 1.4.0
csmbs.cpp
Go to the documentation of this file.
00001 //
00002 // CsmBS
00003 //
00004 // Description: Csm implementation using Broken Stick method to discard components
00005 //
00006 //
00007 // Author: CRIA <t.sutton@reading.ac.uk>, (C) 2004
00008 //
00009 // Copyright: See COPYING file that comes with this distribution
00010 //
00011 //
00012 
00013 #ifdef WIN32
00014 // avoid warnings caused by problems in VC headers
00015 #define _SCL_SECURE_NO_DEPRECATE
00016 #endif
00017 
00018 #include <string.h>
00019 #include <iostream>
00020 #include "csmbs.hh"
00021 
00022 #include <gsl/gsl_statistics_double.h>
00023 #include <gsl/gsl_multifit_nlin.h> //remove this when we have proper covar matrix
00024 #include <gsl/gsl_math.h>
00025 #include <gsl/gsl_eigen.h>
00026 #include <gsl/gsl_sf_gamma.h>
00027 #include <gsl/gsl_randist.h>
00028 #include <gsl/gsl_permutation.h>
00029 #include <gsl/gsl_heapsort.h>
00030 #include <gsl/gsl_sort.h>
00031 #include <gsl/gsl_sort_vector.h>
00032 
00033 #include <math.h>
00034 //next includes needed for rand
00035 #include <stdlib.h>
00036 #include <time.h>
00037 /****************************************************************/
00038 /********************** Algorithm's Metadata ********************/
00039 
00040 #define NUM_PARAM 4
00041 
00042 
00043 /************************************/
00044 /*** Algorithm parameter metadata ***/
00045 
00046 static AlgParamMetadata parameters[NUM_PARAM] = {
00047 
00048       // Metadata of the first parameter.
00049       {
00050         "Randomisations", // Id.
00051         "Number of random eigenvalues", // Name.
00052         Integer, // Type.
00053         "The number of eigenvalues to generate from randomly 'shuffled' environment data.", //overview
00054         "The Broken Stick method of selecting the number of components to keep \
00055 is carried out by randomising the row order of each column in the environmental \
00056 matrix and then obtaining the eigen value for the randomised matrix. \
00057 This is repeatedly carried out for the amount of times specified by the user here.", // Description.
00058 
00059         1, // Not zero if the parameter has lower limit.
00060         1, // Parameter's lower limit.
00061         1, // Not zero if the parameter has upper limit.
00062         1000, // Parameter's upper limit.
00063         "8" // Parameter's typical (default) value.
00064       }
00065       ,
00066       {
00067         "StandardDeviations", // Id.
00068         "Number of standard deviations", // Name.
00069         Real, // Type.
00070         "The number of standard deviations added to the randomised eigen value.", //overview
00071         "When all the eigen values for the 'shuffled' environmental matrix have been summed \
00072 this number of standard deviations is added to the mean of the eigen values. \
00073 Any components whose eigen values are above this threshold are retained.", // Description.
00074 
00075         1, // Not zero if the parameter has lower limit.
00076         -10, // Parameter's lower limit.
00077         1, // Not zero if the parameter has upper limit.
00078         10, // Parameter's upper limit.
00079         "2.0" // Parameter's typical (default) value.
00080       }
00081       ,
00082       {
00083         "MinComponents", // Id.
00084         "Minimum number of components in model", // Name.
00085         Integer, // Type.
00086         "The minimum number of components that the model must have.", //overview
00087         "If not enough components are selected, the model produced will be erroneous or fail. \
00088 Usually three or more components are acceptable", // Description.
00089         1, // Not zero if the parameter has lower limit.
00090         1, // Parameter's lower limit.
00091         1, // Not zero if the parameter has upper limit.
00092         20, // Parameter's upper limit.
00093         "1" // Parameter's typical (default) value.
00094       }
00095       ,
00096       {
00097         "VerboseDebugging", // Id.
00098         "Show very detailed debugging info", // Name.
00099         Integer, // Type.
00100         "Warning this will cause a large amount of information to be printed ", //overview
00101         "Set this to 1 to show extremely verbose diagnostics. \
00102 Set this to 0 to disable verbose diagnostics (this is default behaviour).", // Description.
00103         1, // Not zero if the parameter has lower limit.
00104         0, // Parameter's lower limit.
00105         1, // Not zero if the parameter has upper limit.
00106         1, // Parameter's upper limit.
00107         "0" // Parameter's typical (default) value.
00108       }
00109     };
00110 
00111 
00112 /************************************/
00113 /*** Algorithm's general metadata ***/
00114 
00115 static AlgMetadata metadata = {
00116 
00117     "CSMBS",               // Id.
00118     "Climate Space Model", // Name.
00119     "0.4",                 // Version.
00120 
00121     "Climate Space Model [CSM] is a principle components based \
00122 algorithm developed by Dr. Neil Caithness", //Overview
00123 
00124     "Climate Space Model [CSM] is a principle components based \
00125 algorithm developed by Dr. Neil Caithness. The component \
00126 selection process int this algorithm implementation is \
00127 based on the Broken-Stick cutoff where any component with \
00128 an eigenvalue less than (n stddevs above a randomised sample) is discarded. \n\
00129 The original CSM was written as series of Matlab functions. ", //description
00130 
00131     "Neil Caithness",  // Author
00132     "Robertson M.P., Caithness N., Villet M.H. (2001) A PCA-based modelling technique for predicting environmental suitability for organisms from presence records. Diversity and Distributions, 7:15-27",  // Bibliography.
00133 
00134     "Tim Sutton, Renato De Giovanni",  // Code author.
00135     "t.sutton [at] reading.ac.uk",     // Code author's contact.
00136 
00137     0,  // Does not accept categorical data.
00138     0,  // Does not need (pseudo)absence points.
00139 
00140     NUM_PARAM,   // Algorithm's parameters.
00141     parameters
00142 };
00143 
00144 
00145 
00146 /****************************************************************/
00147 /****************** Algorithm's factory function ****************/
00148 
00149 OM_ALG_DLL_EXPORT
00150 AlgorithmImpl *
00151 algorithmFactory()
00152 {
00153   return new CsmBS();
00154 }
00155 
00156 OM_ALG_DLL_EXPORT
00157 AlgMetadata const *
00158 algorithmMetadata()
00159 {
00160   return &metadata;
00161 }
00162 
00163 /****************************************************************/
00164 /****************************** Csm *****************************/
00165 
00170 CsmBS::CsmBS() : Csm(&metadata)
00171 {
00172   _initialized = 0;
00173   //setup gsl random number generator
00174   const gsl_rng_type * myRngType;
00175   gsl_rng_env_setup();
00176   myRngType = gsl_rng_default;
00177   _randomNumberGenerator = gsl_rng_alloc(myRngType);
00178   //seed the random number generator
00179   gsl_rng_set(_randomNumberGenerator, (unsigned) time( NULL ));
00180 }
00181 
00182 
00184 CsmBS::~CsmBS()
00185 {
00186   gsl_rng_free(_randomNumberGenerator);
00187 }
00188 
00189 
00190 int CsmBS::initialize()
00191 {
00192   Log::instance()->debug( "Starting CSM - Broken Stick\n" );
00193   //set up parameters
00194   if ( ! getParameter( "Randomisations", &numberOfRandomisationsInt ) )
00195   {
00196     Log::instance()->warn( "Parameter Randomisations not set.\n");
00197     return 0;
00198   }
00199   if ( ! getParameter( "StandardDeviations", &numberOfStdDevsFloat) )
00200   {
00201     Log::instance()->warn( "Parameter StandardDeviations not set.\n");
00202     return 0;
00203   }
00204   if ( ! getParameter( "MinComponents", &minComponentsInt) )
00205   {
00206     Log::instance()->warn( "Parameter MinComponents not set.\n");
00207     return 0;
00208   }
00209   int myTempInt=0;
00210   if ( ! getParameter( "VerboseDebugging", &myTempInt) )
00211   {
00212     Log::instance()->warn( "Verbose debugging parameter not set.\n");
00213     return 0;
00214   }
00215   
00216   Log::instance()->debug( "Randomisations parameter set to: %d\n", numberOfRandomisationsInt );
00217   Log::instance()->debug( "StandardDeviations parameter set to: %.4f\n", numberOfStdDevsFloat );
00218   Log::instance()->debug( "MinComponents parameter set to: %d\n", minComponentsInt );
00219 
00220   if ( ! ( myTempInt == 0 || myTempInt == 1 ) )
00221   {
00222     Log::instance()->warn( "CSM - Broken Stick - Verbose debugging parameter out of range: %d\n",
00223                  myTempInt);
00224     return 0;
00225   }
00226 
00227   if (myTempInt!=0)
00228   {
00229     verboseDebuggingBool=true;
00230     Log::instance()->debug( "VerboseDebugging parameter set to: TRUE\n" );
00231   }
00232   else
00233   {
00234     verboseDebuggingBool=false;
00235     Log::instance()->debug( "VerboseDebugging parameter set to: FALSE \n" );
00236   }
00237 
00238   if ( numberOfRandomisationsInt <= 0 || numberOfRandomisationsInt > 1000 )
00239   {
00240     Log::instance()->warn( "CSM - Broken Stick - Randomisations parameter out of range: %f\n",
00241                 numberOfRandomisationsInt );
00242     return 0;
00243   }
00244   if ( numberOfStdDevsFloat<= -10 || numberOfStdDevsFloat> 10 )
00245   {
00246     Log::instance()->warn( "CSM - Broken Stick - StandardDeviations parameter out of range: %f\n",
00247                 numberOfRandomisationsInt );
00248     return 0;
00249   }
00250   if (minComponentsInt < 1 ||minComponentsInt > 20 )
00251   {
00252     Log::instance()->warn( "CSM - Broken Stick - MinComponents parameter out of range: %f\n",
00253                 minComponentsInt);
00254     return 0;
00255   }
00256 
00257   //call the superclass initialier now...
00258   return Csm::initialize();
00259 }
00260 
00261 gsl_matrix * CsmBS::createRandomMatrix(int size1, int size2)
00262 {
00263 
00264   //populate the matrix with random nos
00265   gsl_matrix * m = gsl_matrix_alloc (size1, size2);
00266   for (unsigned int j=0; j < m->size2; j++)
00267   {
00268     for (unsigned int k=0; k < m->size1; k++)
00269     {
00270       double myNo = gsl_rng_uniform_pos(_randomNumberGenerator);
00271       gsl_matrix_set(m,k,j,myNo);
00272     }
00273   }
00274   return m;
00275 }
00276 
00277 /* randomise the data within each column */
00278 gsl_matrix * CsmBS::randomiseColumns(gsl_matrix * original_matrix)
00279 {
00280   //for debugging only
00281   //gsl_matrix * myIndexMatrix = gsl_matrix_alloc (original_matrix->size1, original_matrix->size2);
00282   gsl_matrix * myOuputMatrix = gsl_matrix_alloc (original_matrix->size1, original_matrix->size2);
00283   gsl_matrix * m = createRandomMatrix (original_matrix->size1, original_matrix->size2);
00284   //loop through the matrix columns
00285   for (unsigned int j=0; j < m->size2; j++)
00286   {
00287     //get the column of random numbers
00288     gsl_vector * myRandomColumnVector = gsl_vector_alloc (m->size1);
00289     gsl_vector * myOriginalColumnVector = gsl_vector_alloc (original_matrix->size1);
00290     gsl_matrix_get_col (myRandomColumnVector, m, j);
00291     gsl_matrix_get_col (myOriginalColumnVector, original_matrix, j);
00292     gsl_permutation * myPermutation = gsl_permutation_alloc(m->size1);
00293     //compute the index positions of the sorted random col
00294     gsl_sort_vector_index(myPermutation,myRandomColumnVector);
00295     //assign values in the output array based on their index relative to the sorted
00296     //randomised matrix
00297     for (unsigned int k=0; k < m->size1; k++)
00298     {
00299       double myDouble = gsl_vector_get(myOriginalColumnVector,myPermutation->data[k]);
00300       //for debugging only
00301       //gsl_matrix_set(myIndexMatrix,k,j,myPermutation->data[k]);
00302       //write the elemnt straight into the output matrix
00303       gsl_matrix_set(myOuputMatrix,k,j,myDouble);
00304     } //k loop
00305     //set the output column to the randomly sorted column
00306     //clean up
00307     gsl_permutation_free (myPermutation);
00308     gsl_vector_free (myRandomColumnVector);
00309     gsl_vector_free (myOriginalColumnVector);
00310   }//j loop
00311   gsl_matrix_free(m);
00312   //for debuggin only
00313   //displayMatrix(myIndexMatrix,"Random matrix indexes");
00314   //gsl_matrix_free(myIndexMatrix);
00315   return myOuputMatrix;
00316 }
00317 
00318 int CsmBS::discardComponents()
00319 {
00320   Log::instance()->debug( "Discarding components\n" );
00321 
00322   //create a matrix that will store the eigenvalue vector of each of the
00323   //the randomised environment variables we create
00324   gsl_matrix * myMatrixOfEigenValueVectors =
00325     gsl_matrix_alloc (numberOfRandomisationsInt,_gsl_environment_matrix->size2);
00326   Log::instance()->debug( "Calculating %i randomised matrices...\n", numberOfRandomisationsInt );
00327   for (int i=0; i<numberOfRandomisationsInt;i++)
00328   {
00329     Log::instance()->debug( "Calculating randomised matrix: %i\n", i );
00330 
00331     //
00332     //retreive centered and standardised environmental matrix & clone it
00333     //
00334 
00335     // displayMatrix(_gsl_environment_matrix,"Cloning centered and standardised matrix:");
00336     gsl_matrix * m = randomiseColumns(_gsl_environment_matrix);
00337     //displayMatrix(m,"Randomised matrix:");
00338     //
00339     //  build a cov matrix on the randomised environmental matrix
00340     //
00341     gsl_matrix * myGslCovarianceMatrix = autoCovariance(m);
00342 
00343     //  obtain eigenvalue and eigenvector on the new cov matrix
00344     gsl_vector * myGslEigenValueVector = gsl_vector_alloc (m->size2);
00345     gsl_matrix * myGslEigenVectorMatrix = gsl_matrix_alloc (m->size2, m->size2);
00346     //create a temporary workspace
00347     gsl_eigen_symmv_workspace * myWorkpace = gsl_eigen_symmv_alloc (m->size2);
00348     gsl_eigen_symmv (myGslCovarianceMatrix,
00349                      myGslEigenValueVector,
00350                      myGslEigenVectorMatrix,
00351                      myWorkpace);
00352     //free the temporary workspace again
00353     gsl_eigen_symmv_free (myWorkpace);
00354     displayVector(myGslEigenValueVector, "Randomised eigenvaluevector");
00355     //
00356     //   sort the eigenvalues from > to  < then discard eigenvector
00357     //
00358     gsl_eigen_symmv_sort (myGslEigenValueVector, myGslEigenVectorMatrix,
00359                           GSL_EIGEN_SORT_VAL_DESC);
00360 
00361     //  keep only the eigen values and add them as a new row to a matrix
00362     //  int gsl_matrix_set_row (gsl_matrix * m, size_t i, const gsl_vector * v)
00363     gsl_matrix_set_row (myMatrixOfEigenValueVectors,i,myGslEigenValueVector);
00364 
00365     // clean up temp vectors and matrix
00366     gsl_vector_free (myGslEigenValueVector);
00367     gsl_matrix_free(myGslEigenVectorMatrix);
00368     //  clear temp covariance matrix
00369     gsl_matrix_free(myGslCovarianceMatrix);
00370     gsl_matrix_free (m);
00371     //  repeat as many times as numberOfRandomisationsInt, ading a new row to the matrix each time
00372   }//i loop
00373 
00374   //  now calculate the mean and stddev of each col of output matrix
00375   //  note we must initialise these vectors before calling mean&stddev fn
00376   gsl_vector * myMeanVector = gsl_vector_alloc(myMatrixOfEigenValueVectors->size2);
00377   gsl_vector * myStdDevVector = gsl_vector_alloc(myMatrixOfEigenValueVectors->size2);;
00378   calculateMeanAndSd( myMatrixOfEigenValueVectors,myMeanVector,myStdDevVector);
00379   displayVector (myMeanVector,"Mean of randomised Eigen Vectors");
00380   //  in a new vector save the mean plus (numberOfStdDeviationsFloat * stddev)
00381   gsl_vector * myMeanPlusStdDevsVector = gsl_vector_alloc(myMeanVector->size);
00382   for (unsigned int i=0; i<myMeanVector->size; ++i)
00383   {
00384     double myMean = gsl_vector_get (myMeanVector,i);
00385     double myStdDev = gsl_vector_get (myStdDevVector,i);
00386     gsl_vector_set(myMeanPlusStdDevsVector,i,myMean+(myStdDev*numberOfStdDevsFloat));
00387   }
00388   displayVector (myMeanPlusStdDevsVector,"Mean of randomised Eigen Vectors plus std deviation");
00389   // First determine how many components we are going to keep
00390   // We do this by iterating through the eigenvalues, checking which are above the mean+stddev
00391 
00392   //
00393   // Note: as soon as the first non kept compoinent is encountered, all 
00394   // remaining components are discarded. This is because on 64bit
00395   // platform it can happen that some component scores at the 'small'
00396   // end of the components have a value above the random threshold
00397   // which is not desired.
00398   //
00399   
00400   double sumOfEigenValues = 0;//sum should total number of layers
00401   _retained_components_count=0;
00402   for (unsigned int i=0; i<myMeanPlusStdDevsVector->size; ++i)
00403   {
00404     double cmpValue = gsl_vector_get(myMeanPlusStdDevsVector,i);
00405     sumOfEigenValues += gsl_vector_get(_gsl_eigenvalue_vector,i);
00406     if (cmpValue < gsl_vector_get(_gsl_eigenvalue_vector,i))
00407     {
00408       ++_retained_components_count;
00409       //std::cerr << gsl_vector_get(_gsl_eigenvalue_vector,i) << " > " << cmpValue << ": Component "
00410       //  << i << " is greater than randomised component... retaining it." << std::endl;
00411     }
00412     else
00413     {
00414       break;
00415     }
00416   }
00417   //std::cerr << "Sum of eigenvalues is " << sumOfEigenValues << " (layer count is " << _layer_count  << ")\n";
00418   //std::cerr << "Difference between sum of eigenvalues and layer count = number of invariant layers" << std::endl;
00419   if (_retained_components_count < minComponentsInt)
00420   {
00421     Log::instance()->debug( "Only %i component(s) retained. %i required. \nAborting discard components routine\n",_retained_components_count, minComponentsInt );
00422     gsl_vector_free (myMeanVector);
00423     gsl_vector_free (myStdDevVector);
00424     gsl_vector_free (myMeanPlusStdDevsVector);
00425     gsl_matrix_free (myMatrixOfEigenValueVectors);
00426     return 0;
00427   }
00428 
00429   //  this vector is then used to discard component from the eigenvalue vector where
00430   //  the eigen value is greater than the randomised eigen value
00431 
00432   //so now create a local copy of the eigvect and eigval...
00433   //first clone the vector and matrix...
00434   gsl_vector * tmp_gsl_eigenvalue_vector = gsl_vector_alloc (_layer_count);
00435   gsl_matrix * tmp_gsl_eigenvector_matrix = gsl_matrix_alloc (_layer_count, _layer_count);
00436 
00437   gsl_matrix_memcpy (tmp_gsl_eigenvector_matrix, _gsl_eigenvector_matrix);
00438   gsl_vector_memcpy (tmp_gsl_eigenvalue_vector,_gsl_eigenvalue_vector);
00439 
00440   //now clear our current eig vec and matrix
00441 
00442   gsl_vector_free (_gsl_eigenvalue_vector);
00443   gsl_matrix_free (_gsl_eigenvector_matrix);
00444 
00445   //next we reassign them to the reduced size (ie without unwanted components)
00446   _gsl_eigenvalue_vector = gsl_vector_alloc (_retained_components_count);
00447   _gsl_eigenvector_matrix = gsl_matrix_alloc (_layer_count, _retained_components_count);
00448 
00449   //now copy over just the components we intend to keep into the blank resized
00450   //_gsl_eigenvalue_vector and _gsl_eigenvector_matrix
00451   int myCurrentColInt=0;
00452   for (int j=0;j<_retained_components_count;j++)
00453   {
00454     double cmpValue = gsl_vector_get (tmp_gsl_eigenvalue_vector,j);
00455     if (cmpValue > gsl_vector_get (myMeanPlusStdDevsVector,j))
00456     {
00457       //Good this is a component we want to keep
00458       //...first copy over the vector value we are retaining
00459       gsl_vector_set (_gsl_eigenvalue_vector,myCurrentColInt,cmpValue);
00460       //...next copy over the associated matrix column for the component we are retaining
00461       gsl_vector * tmp_gsl_vector = gsl_vector_alloc (_layer_count);
00462       gsl_matrix_get_col (tmp_gsl_vector, tmp_gsl_eigenvector_matrix, j);
00463       gsl_matrix_set_col (_gsl_eigenvector_matrix,myCurrentColInt,tmp_gsl_vector);
00464       gsl_vector_free (tmp_gsl_vector);
00465       //increment the column counter
00466       myCurrentColInt++;
00467     }//if check for retained componnet
00468   }//j loop
00469   //now clear away the temporary vars
00470   gsl_vector_free (tmp_gsl_eigenvalue_vector);
00471   gsl_matrix_free (tmp_gsl_eigenvector_matrix);
00472 
00473   //clean up
00474   gsl_vector_free (myMeanVector);
00475   gsl_vector_free (myStdDevVector);
00476   gsl_vector_free (myMeanPlusStdDevsVector);
00477   gsl_matrix_free (myMatrixOfEigenValueVectors);
00478 
00479   displayVector( _gsl_eigenvalue_vector, "Vector of retained eigen values:");
00480   displayMatrix( _gsl_eigenvector_matrix,"Matrix of retained eigen vector:");
00481 
00482   Log::instance()->debug( "Completed CSM - Broken Stick\n" );
00483   Log::instance()->debug( "%i out of %i components retained \n", _retained_components_count, _layer_count );
00484   return 1;
00485 }