openModeller
Version 1.4.0
|
00001 // 00002 // CsmBS 00003 // 00004 // Description: Csm implementation using Broken Stick method to discard components 00005 // 00006 // 00007 // Author: CRIA <t.sutton@reading.ac.uk>, (C) 2004 00008 // 00009 // Copyright: See COPYING file that comes with this distribution 00010 // 00011 // 00012 00013 #ifdef WIN32 00014 // avoid warnings caused by problems in VC headers 00015 #define _SCL_SECURE_NO_DEPRECATE 00016 #endif 00017 00018 #include <string.h> 00019 #include <iostream> 00020 #include "csmbs.hh" 00021 00022 #include <gsl/gsl_statistics_double.h> 00023 #include <gsl/gsl_multifit_nlin.h> //remove this when we have proper covar matrix 00024 #include <gsl/gsl_math.h> 00025 #include <gsl/gsl_eigen.h> 00026 #include <gsl/gsl_sf_gamma.h> 00027 #include <gsl/gsl_randist.h> 00028 #include <gsl/gsl_permutation.h> 00029 #include <gsl/gsl_heapsort.h> 00030 #include <gsl/gsl_sort.h> 00031 #include <gsl/gsl_sort_vector.h> 00032 00033 #include <math.h> 00034 //next includes needed for rand 00035 #include <stdlib.h> 00036 #include <time.h> 00037 /****************************************************************/ 00038 /********************** Algorithm's Metadata ********************/ 00039 00040 #define NUM_PARAM 4 00041 00042 00043 /************************************/ 00044 /*** Algorithm parameter metadata ***/ 00045 00046 static AlgParamMetadata parameters[NUM_PARAM] = { 00047 00048 // Metadata of the first parameter. 00049 { 00050 "Randomisations", // Id. 00051 "Number of random eigenvalues", // Name. 00052 Integer, // Type. 00053 "The number of eigenvalues to generate from randomly 'shuffled' environment data.", //overview 00054 "The Broken Stick method of selecting the number of components to keep \ 00055 is carried out by randomising the row order of each column in the environmental \ 00056 matrix and then obtaining the eigen value for the randomised matrix. \ 00057 This is repeatedly carried out for the amount of times specified by the user here.", // Description. 00058 00059 1, // Not zero if the parameter has lower limit. 00060 1, // Parameter's lower limit. 00061 1, // Not zero if the parameter has upper limit. 00062 1000, // Parameter's upper limit. 00063 "8" // Parameter's typical (default) value. 00064 } 00065 , 00066 { 00067 "StandardDeviations", // Id. 00068 "Number of standard deviations", // Name. 00069 Real, // Type. 00070 "The number of standard deviations added to the randomised eigen value.", //overview 00071 "When all the eigen values for the 'shuffled' environmental matrix have been summed \ 00072 this number of standard deviations is added to the mean of the eigen values. \ 00073 Any components whose eigen values are above this threshold are retained.", // Description. 00074 00075 1, // Not zero if the parameter has lower limit. 00076 -10, // Parameter's lower limit. 00077 1, // Not zero if the parameter has upper limit. 00078 10, // Parameter's upper limit. 00079 "2.0" // Parameter's typical (default) value. 00080 } 00081 , 00082 { 00083 "MinComponents", // Id. 00084 "Minimum number of components in model", // Name. 00085 Integer, // Type. 00086 "The minimum number of components that the model must have.", //overview 00087 "If not enough components are selected, the model produced will be erroneous or fail. \ 00088 Usually three or more components are acceptable", // Description. 00089 1, // Not zero if the parameter has lower limit. 00090 1, // Parameter's lower limit. 00091 1, // Not zero if the parameter has upper limit. 00092 20, // Parameter's upper limit. 00093 "1" // Parameter's typical (default) value. 00094 } 00095 , 00096 { 00097 "VerboseDebugging", // Id. 00098 "Show very detailed debugging info", // Name. 00099 Integer, // Type. 00100 "Warning this will cause a large amount of information to be printed ", //overview 00101 "Set this to 1 to show extremely verbose diagnostics. \ 00102 Set this to 0 to disable verbose diagnostics (this is default behaviour).", // Description. 00103 1, // Not zero if the parameter has lower limit. 00104 0, // Parameter's lower limit. 00105 1, // Not zero if the parameter has upper limit. 00106 1, // Parameter's upper limit. 00107 "0" // Parameter's typical (default) value. 00108 } 00109 }; 00110 00111 00112 /************************************/ 00113 /*** Algorithm's general metadata ***/ 00114 00115 static AlgMetadata metadata = { 00116 00117 "CSMBS", // Id. 00118 "Climate Space Model", // Name. 00119 "0.4", // Version. 00120 00121 "Climate Space Model [CSM] is a principle components based \ 00122 algorithm developed by Dr. Neil Caithness", //Overview 00123 00124 "Climate Space Model [CSM] is a principle components based \ 00125 algorithm developed by Dr. Neil Caithness. The component \ 00126 selection process int this algorithm implementation is \ 00127 based on the Broken-Stick cutoff where any component with \ 00128 an eigenvalue less than (n stddevs above a randomised sample) is discarded. \n\ 00129 The original CSM was written as series of Matlab functions. ", //description 00130 00131 "Neil Caithness", // Author 00132 "Robertson M.P., Caithness N., Villet M.H. (2001) A PCA-based modelling technique for predicting environmental suitability for organisms from presence records. Diversity and Distributions, 7:15-27", // Bibliography. 00133 00134 "Tim Sutton, Renato De Giovanni", // Code author. 00135 "t.sutton [at] reading.ac.uk", // Code author's contact. 00136 00137 0, // Does not accept categorical data. 00138 0, // Does not need (pseudo)absence points. 00139 00140 NUM_PARAM, // Algorithm's parameters. 00141 parameters 00142 }; 00143 00144 00145 00146 /****************************************************************/ 00147 /****************** Algorithm's factory function ****************/ 00148 00149 OM_ALG_DLL_EXPORT 00150 AlgorithmImpl * 00151 algorithmFactory() 00152 { 00153 return new CsmBS(); 00154 } 00155 00156 OM_ALG_DLL_EXPORT 00157 AlgMetadata const * 00158 algorithmMetadata() 00159 { 00160 return &metadata; 00161 } 00162 00163 /****************************************************************/ 00164 /****************************** Csm *****************************/ 00165 00170 CsmBS::CsmBS() : Csm(&metadata) 00171 { 00172 _initialized = 0; 00173 //setup gsl random number generator 00174 const gsl_rng_type * myRngType; 00175 gsl_rng_env_setup(); 00176 myRngType = gsl_rng_default; 00177 _randomNumberGenerator = gsl_rng_alloc(myRngType); 00178 //seed the random number generator 00179 gsl_rng_set(_randomNumberGenerator, (unsigned) time( NULL )); 00180 } 00181 00182 00184 CsmBS::~CsmBS() 00185 { 00186 gsl_rng_free(_randomNumberGenerator); 00187 } 00188 00189 00190 int CsmBS::initialize() 00191 { 00192 Log::instance()->debug( "Starting CSM - Broken Stick\n" ); 00193 //set up parameters 00194 if ( ! getParameter( "Randomisations", &numberOfRandomisationsInt ) ) 00195 { 00196 Log::instance()->warn( "Parameter Randomisations not set.\n"); 00197 return 0; 00198 } 00199 if ( ! getParameter( "StandardDeviations", &numberOfStdDevsFloat) ) 00200 { 00201 Log::instance()->warn( "Parameter StandardDeviations not set.\n"); 00202 return 0; 00203 } 00204 if ( ! getParameter( "MinComponents", &minComponentsInt) ) 00205 { 00206 Log::instance()->warn( "Parameter MinComponents not set.\n"); 00207 return 0; 00208 } 00209 int myTempInt=0; 00210 if ( ! getParameter( "VerboseDebugging", &myTempInt) ) 00211 { 00212 Log::instance()->warn( "Verbose debugging parameter not set.\n"); 00213 return 0; 00214 } 00215 00216 Log::instance()->debug( "Randomisations parameter set to: %d\n", numberOfRandomisationsInt ); 00217 Log::instance()->debug( "StandardDeviations parameter set to: %.4f\n", numberOfStdDevsFloat ); 00218 Log::instance()->debug( "MinComponents parameter set to: %d\n", minComponentsInt ); 00219 00220 if ( ! ( myTempInt == 0 || myTempInt == 1 ) ) 00221 { 00222 Log::instance()->warn( "CSM - Broken Stick - Verbose debugging parameter out of range: %d\n", 00223 myTempInt); 00224 return 0; 00225 } 00226 00227 if (myTempInt!=0) 00228 { 00229 verboseDebuggingBool=true; 00230 Log::instance()->debug( "VerboseDebugging parameter set to: TRUE\n" ); 00231 } 00232 else 00233 { 00234 verboseDebuggingBool=false; 00235 Log::instance()->debug( "VerboseDebugging parameter set to: FALSE \n" ); 00236 } 00237 00238 if ( numberOfRandomisationsInt <= 0 || numberOfRandomisationsInt > 1000 ) 00239 { 00240 Log::instance()->warn( "CSM - Broken Stick - Randomisations parameter out of range: %f\n", 00241 numberOfRandomisationsInt ); 00242 return 0; 00243 } 00244 if ( numberOfStdDevsFloat<= -10 || numberOfStdDevsFloat> 10 ) 00245 { 00246 Log::instance()->warn( "CSM - Broken Stick - StandardDeviations parameter out of range: %f\n", 00247 numberOfRandomisationsInt ); 00248 return 0; 00249 } 00250 if (minComponentsInt < 1 ||minComponentsInt > 20 ) 00251 { 00252 Log::instance()->warn( "CSM - Broken Stick - MinComponents parameter out of range: %f\n", 00253 minComponentsInt); 00254 return 0; 00255 } 00256 00257 //call the superclass initialier now... 00258 return Csm::initialize(); 00259 } 00260 00261 gsl_matrix * CsmBS::createRandomMatrix(int size1, int size2) 00262 { 00263 00264 //populate the matrix with random nos 00265 gsl_matrix * m = gsl_matrix_alloc (size1, size2); 00266 for (unsigned int j=0; j < m->size2; j++) 00267 { 00268 for (unsigned int k=0; k < m->size1; k++) 00269 { 00270 double myNo = gsl_rng_uniform_pos(_randomNumberGenerator); 00271 gsl_matrix_set(m,k,j,myNo); 00272 } 00273 } 00274 return m; 00275 } 00276 00277 /* randomise the data within each column */ 00278 gsl_matrix * CsmBS::randomiseColumns(gsl_matrix * original_matrix) 00279 { 00280 //for debugging only 00281 //gsl_matrix * myIndexMatrix = gsl_matrix_alloc (original_matrix->size1, original_matrix->size2); 00282 gsl_matrix * myOuputMatrix = gsl_matrix_alloc (original_matrix->size1, original_matrix->size2); 00283 gsl_matrix * m = createRandomMatrix (original_matrix->size1, original_matrix->size2); 00284 //loop through the matrix columns 00285 for (unsigned int j=0; j < m->size2; j++) 00286 { 00287 //get the column of random numbers 00288 gsl_vector * myRandomColumnVector = gsl_vector_alloc (m->size1); 00289 gsl_vector * myOriginalColumnVector = gsl_vector_alloc (original_matrix->size1); 00290 gsl_matrix_get_col (myRandomColumnVector, m, j); 00291 gsl_matrix_get_col (myOriginalColumnVector, original_matrix, j); 00292 gsl_permutation * myPermutation = gsl_permutation_alloc(m->size1); 00293 //compute the index positions of the sorted random col 00294 gsl_sort_vector_index(myPermutation,myRandomColumnVector); 00295 //assign values in the output array based on their index relative to the sorted 00296 //randomised matrix 00297 for (unsigned int k=0; k < m->size1; k++) 00298 { 00299 double myDouble = gsl_vector_get(myOriginalColumnVector,myPermutation->data[k]); 00300 //for debugging only 00301 //gsl_matrix_set(myIndexMatrix,k,j,myPermutation->data[k]); 00302 //write the elemnt straight into the output matrix 00303 gsl_matrix_set(myOuputMatrix,k,j,myDouble); 00304 } //k loop 00305 //set the output column to the randomly sorted column 00306 //clean up 00307 gsl_permutation_free (myPermutation); 00308 gsl_vector_free (myRandomColumnVector); 00309 gsl_vector_free (myOriginalColumnVector); 00310 }//j loop 00311 gsl_matrix_free(m); 00312 //for debuggin only 00313 //displayMatrix(myIndexMatrix,"Random matrix indexes"); 00314 //gsl_matrix_free(myIndexMatrix); 00315 return myOuputMatrix; 00316 } 00317 00318 int CsmBS::discardComponents() 00319 { 00320 Log::instance()->debug( "Discarding components\n" ); 00321 00322 //create a matrix that will store the eigenvalue vector of each of the 00323 //the randomised environment variables we create 00324 gsl_matrix * myMatrixOfEigenValueVectors = 00325 gsl_matrix_alloc (numberOfRandomisationsInt,_gsl_environment_matrix->size2); 00326 Log::instance()->debug( "Calculating %i randomised matrices...\n", numberOfRandomisationsInt ); 00327 for (int i=0; i<numberOfRandomisationsInt;i++) 00328 { 00329 Log::instance()->debug( "Calculating randomised matrix: %i\n", i ); 00330 00331 // 00332 //retreive centered and standardised environmental matrix & clone it 00333 // 00334 00335 // displayMatrix(_gsl_environment_matrix,"Cloning centered and standardised matrix:"); 00336 gsl_matrix * m = randomiseColumns(_gsl_environment_matrix); 00337 //displayMatrix(m,"Randomised matrix:"); 00338 // 00339 // build a cov matrix on the randomised environmental matrix 00340 // 00341 gsl_matrix * myGslCovarianceMatrix = autoCovariance(m); 00342 00343 // obtain eigenvalue and eigenvector on the new cov matrix 00344 gsl_vector * myGslEigenValueVector = gsl_vector_alloc (m->size2); 00345 gsl_matrix * myGslEigenVectorMatrix = gsl_matrix_alloc (m->size2, m->size2); 00346 //create a temporary workspace 00347 gsl_eigen_symmv_workspace * myWorkpace = gsl_eigen_symmv_alloc (m->size2); 00348 gsl_eigen_symmv (myGslCovarianceMatrix, 00349 myGslEigenValueVector, 00350 myGslEigenVectorMatrix, 00351 myWorkpace); 00352 //free the temporary workspace again 00353 gsl_eigen_symmv_free (myWorkpace); 00354 displayVector(myGslEigenValueVector, "Randomised eigenvaluevector"); 00355 // 00356 // sort the eigenvalues from > to < then discard eigenvector 00357 // 00358 gsl_eigen_symmv_sort (myGslEigenValueVector, myGslEigenVectorMatrix, 00359 GSL_EIGEN_SORT_VAL_DESC); 00360 00361 // keep only the eigen values and add them as a new row to a matrix 00362 // int gsl_matrix_set_row (gsl_matrix * m, size_t i, const gsl_vector * v) 00363 gsl_matrix_set_row (myMatrixOfEigenValueVectors,i,myGslEigenValueVector); 00364 00365 // clean up temp vectors and matrix 00366 gsl_vector_free (myGslEigenValueVector); 00367 gsl_matrix_free(myGslEigenVectorMatrix); 00368 // clear temp covariance matrix 00369 gsl_matrix_free(myGslCovarianceMatrix); 00370 gsl_matrix_free (m); 00371 // repeat as many times as numberOfRandomisationsInt, ading a new row to the matrix each time 00372 }//i loop 00373 00374 // now calculate the mean and stddev of each col of output matrix 00375 // note we must initialise these vectors before calling mean&stddev fn 00376 gsl_vector * myMeanVector = gsl_vector_alloc(myMatrixOfEigenValueVectors->size2); 00377 gsl_vector * myStdDevVector = gsl_vector_alloc(myMatrixOfEigenValueVectors->size2);; 00378 calculateMeanAndSd( myMatrixOfEigenValueVectors,myMeanVector,myStdDevVector); 00379 displayVector (myMeanVector,"Mean of randomised Eigen Vectors"); 00380 // in a new vector save the mean plus (numberOfStdDeviationsFloat * stddev) 00381 gsl_vector * myMeanPlusStdDevsVector = gsl_vector_alloc(myMeanVector->size); 00382 for (unsigned int i=0; i<myMeanVector->size; ++i) 00383 { 00384 double myMean = gsl_vector_get (myMeanVector,i); 00385 double myStdDev = gsl_vector_get (myStdDevVector,i); 00386 gsl_vector_set(myMeanPlusStdDevsVector,i,myMean+(myStdDev*numberOfStdDevsFloat)); 00387 } 00388 displayVector (myMeanPlusStdDevsVector,"Mean of randomised Eigen Vectors plus std deviation"); 00389 // First determine how many components we are going to keep 00390 // We do this by iterating through the eigenvalues, checking which are above the mean+stddev 00391 00392 // 00393 // Note: as soon as the first non kept compoinent is encountered, all 00394 // remaining components are discarded. This is because on 64bit 00395 // platform it can happen that some component scores at the 'small' 00396 // end of the components have a value above the random threshold 00397 // which is not desired. 00398 // 00399 00400 double sumOfEigenValues = 0;//sum should total number of layers 00401 _retained_components_count=0; 00402 for (unsigned int i=0; i<myMeanPlusStdDevsVector->size; ++i) 00403 { 00404 double cmpValue = gsl_vector_get(myMeanPlusStdDevsVector,i); 00405 sumOfEigenValues += gsl_vector_get(_gsl_eigenvalue_vector,i); 00406 if (cmpValue < gsl_vector_get(_gsl_eigenvalue_vector,i)) 00407 { 00408 ++_retained_components_count; 00409 //std::cerr << gsl_vector_get(_gsl_eigenvalue_vector,i) << " > " << cmpValue << ": Component " 00410 // << i << " is greater than randomised component... retaining it." << std::endl; 00411 } 00412 else 00413 { 00414 break; 00415 } 00416 } 00417 //std::cerr << "Sum of eigenvalues is " << sumOfEigenValues << " (layer count is " << _layer_count << ")\n"; 00418 //std::cerr << "Difference between sum of eigenvalues and layer count = number of invariant layers" << std::endl; 00419 if (_retained_components_count < minComponentsInt) 00420 { 00421 Log::instance()->debug( "Only %i component(s) retained. %i required. \nAborting discard components routine\n",_retained_components_count, minComponentsInt ); 00422 gsl_vector_free (myMeanVector); 00423 gsl_vector_free (myStdDevVector); 00424 gsl_vector_free (myMeanPlusStdDevsVector); 00425 gsl_matrix_free (myMatrixOfEigenValueVectors); 00426 return 0; 00427 } 00428 00429 // this vector is then used to discard component from the eigenvalue vector where 00430 // the eigen value is greater than the randomised eigen value 00431 00432 //so now create a local copy of the eigvect and eigval... 00433 //first clone the vector and matrix... 00434 gsl_vector * tmp_gsl_eigenvalue_vector = gsl_vector_alloc (_layer_count); 00435 gsl_matrix * tmp_gsl_eigenvector_matrix = gsl_matrix_alloc (_layer_count, _layer_count); 00436 00437 gsl_matrix_memcpy (tmp_gsl_eigenvector_matrix, _gsl_eigenvector_matrix); 00438 gsl_vector_memcpy (tmp_gsl_eigenvalue_vector,_gsl_eigenvalue_vector); 00439 00440 //now clear our current eig vec and matrix 00441 00442 gsl_vector_free (_gsl_eigenvalue_vector); 00443 gsl_matrix_free (_gsl_eigenvector_matrix); 00444 00445 //next we reassign them to the reduced size (ie without unwanted components) 00446 _gsl_eigenvalue_vector = gsl_vector_alloc (_retained_components_count); 00447 _gsl_eigenvector_matrix = gsl_matrix_alloc (_layer_count, _retained_components_count); 00448 00449 //now copy over just the components we intend to keep into the blank resized 00450 //_gsl_eigenvalue_vector and _gsl_eigenvector_matrix 00451 int myCurrentColInt=0; 00452 for (int j=0;j<_retained_components_count;j++) 00453 { 00454 double cmpValue = gsl_vector_get (tmp_gsl_eigenvalue_vector,j); 00455 if (cmpValue > gsl_vector_get (myMeanPlusStdDevsVector,j)) 00456 { 00457 //Good this is a component we want to keep 00458 //...first copy over the vector value we are retaining 00459 gsl_vector_set (_gsl_eigenvalue_vector,myCurrentColInt,cmpValue); 00460 //...next copy over the associated matrix column for the component we are retaining 00461 gsl_vector * tmp_gsl_vector = gsl_vector_alloc (_layer_count); 00462 gsl_matrix_get_col (tmp_gsl_vector, tmp_gsl_eigenvector_matrix, j); 00463 gsl_matrix_set_col (_gsl_eigenvector_matrix,myCurrentColInt,tmp_gsl_vector); 00464 gsl_vector_free (tmp_gsl_vector); 00465 //increment the column counter 00466 myCurrentColInt++; 00467 }//if check for retained componnet 00468 }//j loop 00469 //now clear away the temporary vars 00470 gsl_vector_free (tmp_gsl_eigenvalue_vector); 00471 gsl_matrix_free (tmp_gsl_eigenvector_matrix); 00472 00473 //clean up 00474 gsl_vector_free (myMeanVector); 00475 gsl_vector_free (myStdDevVector); 00476 gsl_vector_free (myMeanPlusStdDevsVector); 00477 gsl_matrix_free (myMatrixOfEigenValueVectors); 00478 00479 displayVector( _gsl_eigenvalue_vector, "Vector of retained eigen values:"); 00480 displayMatrix( _gsl_eigenvector_matrix,"Matrix of retained eigen vector:"); 00481 00482 Log::instance()->debug( "Completed CSM - Broken Stick\n" ); 00483 Log::instance()->debug( "%i out of %i components retained \n", _retained_components_count, _layer_count ); 00484 return 1; 00485 }