openModeller  Version 1.5.0
csmbs.cpp
Go to the documentation of this file.
1 //
2 // CsmBS
3 //
4 // Description: Csm implementation using Broken Stick method to discard components
5 //
6 //
7 // Author: CRIA <t.sutton@reading.ac.uk>, (C) 2004
8 //
9 // Copyright: See COPYING file that comes with this distribution
10 //
11 //
12 
13 #ifdef WIN32
14 // avoid warnings caused by problems in VC headers
15 #define _SCL_SECURE_NO_DEPRECATE
16 #endif
17 
18 #include <string.h>
19 #include <iostream>
20 #include "csmbs.hh"
21 
22 #include <gsl/gsl_statistics_double.h>
23 #include <gsl/gsl_multifit_nlin.h> //remove this when we have proper covar matrix
24 #include <gsl/gsl_math.h>
25 #include <gsl/gsl_eigen.h>
26 #include <gsl/gsl_sf_gamma.h>
27 #include <gsl/gsl_randist.h>
28 #include <gsl/gsl_permutation.h>
29 #include <gsl/gsl_heapsort.h>
30 #include <gsl/gsl_sort.h>
31 #include <gsl/gsl_sort_vector.h>
32 
33 #include <math.h>
34 //next includes needed for rand
35 #include <stdlib.h>
36 #include <time.h>
37 /****************************************************************/
38 /********************** Algorithm's Metadata ********************/
39 
40 #define NUM_PARAM 4
41 
42 
43 /************************************/
44 /*** Algorithm parameter metadata ***/
45 
47 
48  // Metadata of the first parameter.
49  {
50  "Randomisations", // Id.
51  "Number of random eigenvalues", // Name.
52  Integer, // Type.
53  "The number of eigenvalues to generate from randomly 'shuffled' environment data.", //overview
54  "The Broken Stick method of selecting the number of components to keep \
55 is carried out by randomising the row order of each column in the environmental \
56 matrix and then obtaining the eigen value for the randomised matrix. \
57 This is repeatedly carried out for the amount of times specified by the user here.", // Description.
58 
59  1, // Not zero if the parameter has lower limit.
60  1, // Parameter's lower limit.
61  1, // Not zero if the parameter has upper limit.
62  1000, // Parameter's upper limit.
63  "8" // Parameter's typical (default) value.
64  }
65  ,
66  {
67  "StandardDeviations", // Id.
68  "Number of standard deviations", // Name.
69  Real, // Type.
70  "The number of standard deviations added to the randomised eigen value.", //overview
71  "When all the eigen values for the 'shuffled' environmental matrix have been summed \
72 this number of standard deviations is added to the mean of the eigen values. \
73 Any components whose eigen values are above this threshold are retained.", // Description.
74 
75  1, // Not zero if the parameter has lower limit.
76  -10, // Parameter's lower limit.
77  1, // Not zero if the parameter has upper limit.
78  10, // Parameter's upper limit.
79  "2.0" // Parameter's typical (default) value.
80  }
81  ,
82  {
83  "MinComponents", // Id.
84  "Minimum number of components in model", // Name.
85  Integer, // Type.
86  "The minimum number of components that the model must have.", //overview
87  "If not enough components are selected, the model produced will be erroneous or fail. \
88 Usually three or more components are acceptable", // Description.
89  1, // Not zero if the parameter has lower limit.
90  1, // Parameter's lower limit.
91  1, // Not zero if the parameter has upper limit.
92  20, // Parameter's upper limit.
93  "1" // Parameter's typical (default) value.
94  }
95  ,
96  {
97  "VerboseDebugging", // Id.
98  "Show very detailed debugging info", // Name.
99  Integer, // Type.
100  "Warning this will cause a large amount of information to be printed ", //overview
101  "Set this to 1 to show extremely verbose diagnostics. \
102 Set this to 0 to disable verbose diagnostics (this is default behaviour).", // Description.
103  1, // Not zero if the parameter has lower limit.
104  0, // Parameter's lower limit.
105  1, // Not zero if the parameter has upper limit.
106  1, // Parameter's upper limit.
107  "0" // Parameter's typical (default) value.
108  }
109  };
110 
111 
112 /************************************/
113 /*** Algorithm's general metadata ***/
114 
116 
117  "CSMBS", // Id.
118  "Climate Space Model", // Name.
119  "0.4", // Version.
120 
121  "Climate Space Model [CSM] is a principle components based \
122 algorithm developed by Dr. Neil Caithness", //Overview
123 
124  "Climate Space Model [CSM] is a principle components based \
125 algorithm developed by Dr. Neil Caithness. The component \
126 selection process int this algorithm implementation is \
127 based on the Broken-Stick cutoff where any component with \
128 an eigenvalue less than (n stddevs above a randomised sample) is discarded. \n\
129 The original CSM was written as series of Matlab functions. ", //description
130 
131  "Neil Caithness", // Author
132  "Robertson M.P., Caithness N., Villet M.H. (2001) A PCA-based modelling technique for predicting environmental suitability for organisms from presence records. Diversity and Distributions, 7:15-27", // Bibliography.
133 
134  "Tim Sutton, Renato De Giovanni", // Code author.
135  "t.sutton [at] reading.ac.uk", // Code author's contact.
136 
137  0, // Does not accept categorical data.
138  0, // Does not need (pseudo)absence points.
139 
140  NUM_PARAM, // Algorithm's parameters.
141  parameters
142 };
143 
144 
145 
146 /****************************************************************/
147 /****************** Algorithm's factory function ****************/
148 
149 OM_ALG_DLL_EXPORT
152 {
153  return new CsmBS();
154 }
155 
156 OM_ALG_DLL_EXPORT
157 AlgMetadata const *
159 {
160  return &metadata;
161 }
162 
163 /****************************************************************/
164 /****************************** Csm *****************************/
165 
170 CsmBS::CsmBS() : Csm(&metadata)
171 {
172  _initialized = 0;
173  //setup gsl random number generator
174  const gsl_rng_type * myRngType;
175  gsl_rng_env_setup();
176  myRngType = gsl_rng_default;
177  _randomNumberGenerator = gsl_rng_alloc(myRngType);
178  //seed the random number generator
179  gsl_rng_set(_randomNumberGenerator, (unsigned) time( NULL ));
180 }
181 
182 
185 {
186  gsl_rng_free(_randomNumberGenerator);
187 }
188 
189 
191 {
192  Log::instance()->debug( "Starting CSM - Broken Stick\n" );
193  //set up parameters
194  if ( ! getParameter( "Randomisations", &numberOfRandomisationsInt ) )
195  {
196  Log::instance()->warn( "Parameter Randomisations not set.\n");
197  return 0;
198  }
199  if ( ! getParameter( "StandardDeviations", &numberOfStdDevsFloat) )
200  {
201  Log::instance()->warn( "Parameter StandardDeviations not set.\n");
202  return 0;
203  }
204  if ( ! getParameter( "MinComponents", &minComponentsInt) )
205  {
206  Log::instance()->warn( "Parameter MinComponents not set.\n");
207  return 0;
208  }
209  int myTempInt=0;
210  if ( ! getParameter( "VerboseDebugging", &myTempInt) )
211  {
212  Log::instance()->warn( "Verbose debugging parameter not set.\n");
213  return 0;
214  }
215 
216  Log::instance()->debug( "Randomisations parameter set to: %d\n", numberOfRandomisationsInt );
217  Log::instance()->debug( "StandardDeviations parameter set to: %.4f\n", numberOfStdDevsFloat );
218  Log::instance()->debug( "MinComponents parameter set to: %d\n", minComponentsInt );
219 
220  if ( ! ( myTempInt == 0 || myTempInt == 1 ) )
221  {
222  Log::instance()->warn( "CSM - Broken Stick - Verbose debugging parameter out of range: %d\n",
223  myTempInt);
224  return 0;
225  }
226 
227  if (myTempInt!=0)
228  {
230  Log::instance()->debug( "VerboseDebugging parameter set to: TRUE\n" );
231  }
232  else
233  {
234  verboseDebuggingBool=false;
235  Log::instance()->debug( "VerboseDebugging parameter set to: FALSE \n" );
236  }
237 
238  if ( numberOfRandomisationsInt <= 0 || numberOfRandomisationsInt > 1000 )
239  {
240  Log::instance()->warn( "CSM - Broken Stick - Randomisations parameter out of range: %f\n",
242  return 0;
243  }
244  if ( numberOfStdDevsFloat<= -10 || numberOfStdDevsFloat> 10 )
245  {
246  Log::instance()->warn( "CSM - Broken Stick - StandardDeviations parameter out of range: %f\n",
248  return 0;
249  }
250  if (minComponentsInt < 1 ||minComponentsInt > 20 )
251  {
252  Log::instance()->warn( "CSM - Broken Stick - MinComponents parameter out of range: %f\n",
254  return 0;
255  }
256 
257  //call the superclass initialier now...
258  return Csm::initialize();
259 }
260 
261 gsl_matrix * CsmBS::createRandomMatrix(int size1, int size2)
262 {
263 
264  //populate the matrix with random nos
265  gsl_matrix * m = gsl_matrix_alloc (size1, size2);
266  for (unsigned int j=0; j < m->size2; j++)
267  {
268  for (unsigned int k=0; k < m->size1; k++)
269  {
270  double myNo = gsl_rng_uniform_pos(_randomNumberGenerator);
271  gsl_matrix_set(m,k,j,myNo);
272  }
273  }
274  return m;
275 }
276 
277 /* randomise the data within each column */
278 gsl_matrix * CsmBS::randomiseColumns(gsl_matrix * original_matrix)
279 {
280  //for debugging only
281  //gsl_matrix * myIndexMatrix = gsl_matrix_alloc (original_matrix->size1, original_matrix->size2);
282  gsl_matrix * myOuputMatrix = gsl_matrix_alloc (original_matrix->size1, original_matrix->size2);
283  gsl_matrix * m = createRandomMatrix (original_matrix->size1, original_matrix->size2);
284  //loop through the matrix columns
285  for (unsigned int j=0; j < m->size2; j++)
286  {
287  //get the column of random numbers
288  gsl_vector * myRandomColumnVector = gsl_vector_alloc (m->size1);
289  gsl_vector * myOriginalColumnVector = gsl_vector_alloc (original_matrix->size1);
290  gsl_matrix_get_col (myRandomColumnVector, m, j);
291  gsl_matrix_get_col (myOriginalColumnVector, original_matrix, j);
292  gsl_permutation * myPermutation = gsl_permutation_alloc(m->size1);
293  //compute the index positions of the sorted random col
294  gsl_sort_vector_index(myPermutation,myRandomColumnVector);
295  //assign values in the output array based on their index relative to the sorted
296  //randomised matrix
297  for (unsigned int k=0; k < m->size1; k++)
298  {
299  double myDouble = gsl_vector_get(myOriginalColumnVector,myPermutation->data[k]);
300  //for debugging only
301  //gsl_matrix_set(myIndexMatrix,k,j,myPermutation->data[k]);
302  //write the elemnt straight into the output matrix
303  gsl_matrix_set(myOuputMatrix,k,j,myDouble);
304  } //k loop
305  //set the output column to the randomly sorted column
306  //clean up
307  gsl_permutation_free (myPermutation);
308  gsl_vector_free (myRandomColumnVector);
309  gsl_vector_free (myOriginalColumnVector);
310  }//j loop
311  gsl_matrix_free(m);
312  //for debuggin only
313  //displayMatrix(myIndexMatrix,"Random matrix indexes");
314  //gsl_matrix_free(myIndexMatrix);
315  return myOuputMatrix;
316 }
317 
319 {
320  Log::instance()->debug( "Discarding components\n" );
321 
322  //create a matrix that will store the eigenvalue vector of each of the
323  //the randomised environment variables we create
324  gsl_matrix * myMatrixOfEigenValueVectors =
325  gsl_matrix_alloc (numberOfRandomisationsInt,_gsl_environment_matrix->size2);
326  Log::instance()->debug( "Calculating %i randomised matrices...\n", numberOfRandomisationsInt );
327  for (int i=0; i<numberOfRandomisationsInt;i++)
328  {
329  Log::instance()->debug( "Calculating randomised matrix: %i\n", i );
330 
331  //
332  //retreive centered and standardised environmental matrix & clone it
333  //
334 
335  // displayMatrix(_gsl_environment_matrix,"Cloning centered and standardised matrix:");
336  gsl_matrix * m = randomiseColumns(_gsl_environment_matrix);
337  //displayMatrix(m,"Randomised matrix:");
338  //
339  // build a cov matrix on the randomised environmental matrix
340  //
341  gsl_matrix * myGslCovarianceMatrix = autoCovariance(m);
342 
343  // obtain eigenvalue and eigenvector on the new cov matrix
344  gsl_vector * myGslEigenValueVector = gsl_vector_alloc (m->size2);
345  gsl_matrix * myGslEigenVectorMatrix = gsl_matrix_alloc (m->size2, m->size2);
346  //create a temporary workspace
347  gsl_eigen_symmv_workspace * myWorkpace = gsl_eigen_symmv_alloc (m->size2);
348  gsl_eigen_symmv (myGslCovarianceMatrix,
349  myGslEigenValueVector,
350  myGslEigenVectorMatrix,
351  myWorkpace);
352  //free the temporary workspace again
353  gsl_eigen_symmv_free (myWorkpace);
354  displayVector(myGslEigenValueVector, "Randomised eigenvaluevector");
355  //
356  // sort the eigenvalues from > to < then discard eigenvector
357  //
358  gsl_eigen_symmv_sort (myGslEigenValueVector, myGslEigenVectorMatrix,
359  GSL_EIGEN_SORT_VAL_DESC);
360 
361  // keep only the eigen values and add them as a new row to a matrix
362  // int gsl_matrix_set_row (gsl_matrix * m, size_t i, const gsl_vector * v)
363  gsl_matrix_set_row (myMatrixOfEigenValueVectors,i,myGslEigenValueVector);
364 
365  // clean up temp vectors and matrix
366  gsl_vector_free (myGslEigenValueVector);
367  gsl_matrix_free(myGslEigenVectorMatrix);
368  // clear temp covariance matrix
369  gsl_matrix_free(myGslCovarianceMatrix);
370  gsl_matrix_free (m);
371  // repeat as many times as numberOfRandomisationsInt, ading a new row to the matrix each time
372  }//i loop
373 
374  // now calculate the mean and stddev of each col of output matrix
375  // note we must initialise these vectors before calling mean&stddev fn
376  gsl_vector * myMeanVector = gsl_vector_alloc(myMatrixOfEigenValueVectors->size2);
377  gsl_vector * myStdDevVector = gsl_vector_alloc(myMatrixOfEigenValueVectors->size2);;
378  calculateMeanAndSd( myMatrixOfEigenValueVectors,myMeanVector,myStdDevVector);
379  displayVector (myMeanVector,"Mean of randomised Eigen Vectors");
380  // in a new vector save the mean plus (numberOfStdDeviationsFloat * stddev)
381  gsl_vector * myMeanPlusStdDevsVector = gsl_vector_alloc(myMeanVector->size);
382  for (unsigned int i=0; i<myMeanVector->size; ++i)
383  {
384  double myMean = gsl_vector_get (myMeanVector,i);
385  double myStdDev = gsl_vector_get (myStdDevVector,i);
386  gsl_vector_set(myMeanPlusStdDevsVector,i,myMean+(myStdDev*numberOfStdDevsFloat));
387  }
388  displayVector (myMeanPlusStdDevsVector,"Mean of randomised Eigen Vectors plus std deviation");
389  // First determine how many components we are going to keep
390  // We do this by iterating through the eigenvalues, checking which are above the mean+stddev
391 
392  //
393  // Note: as soon as the first non kept compoinent is encountered, all
394  // remaining components are discarded. This is because on 64bit
395  // platform it can happen that some component scores at the 'small'
396  // end of the components have a value above the random threshold
397  // which is not desired.
398  //
399 
400  double sumOfEigenValues = 0;//sum should total number of layers
402  for (unsigned int i=0; i<myMeanPlusStdDevsVector->size; ++i)
403  {
404  double cmpValue = gsl_vector_get(myMeanPlusStdDevsVector,i);
405  sumOfEigenValues += gsl_vector_get(_gsl_eigenvalue_vector,i);
406  if (cmpValue < gsl_vector_get(_gsl_eigenvalue_vector,i))
407  {
409  //std::cerr << gsl_vector_get(_gsl_eigenvalue_vector,i) << " > " << cmpValue << ": Component "
410  // << i << " is greater than randomised component... retaining it." << std::endl;
411  }
412  else
413  {
414  break;
415  }
416  }
417  //std::cerr << "Sum of eigenvalues is " << sumOfEigenValues << " (layer count is " << _layer_count << ")\n";
418  //std::cerr << "Difference between sum of eigenvalues and layer count = number of invariant layers" << std::endl;
420  {
421  Log::instance()->debug( "Only %i component(s) retained. %i required. \nAborting discard components routine\n",_retained_components_count, minComponentsInt );
422  gsl_vector_free (myMeanVector);
423  gsl_vector_free (myStdDevVector);
424  gsl_vector_free (myMeanPlusStdDevsVector);
425  gsl_matrix_free (myMatrixOfEigenValueVectors);
426  return 0;
427  }
428 
429  // this vector is then used to discard component from the eigenvalue vector where
430  // the eigen value is greater than the randomised eigen value
431 
432  //so now create a local copy of the eigvect and eigval...
433  //first clone the vector and matrix...
434  gsl_vector * tmp_gsl_eigenvalue_vector = gsl_vector_alloc (_layer_count);
435  gsl_matrix * tmp_gsl_eigenvector_matrix = gsl_matrix_alloc (_layer_count, _layer_count);
436 
437  gsl_matrix_memcpy (tmp_gsl_eigenvector_matrix, _gsl_eigenvector_matrix);
438  gsl_vector_memcpy (tmp_gsl_eigenvalue_vector,_gsl_eigenvalue_vector);
439 
440  //now clear our current eig vec and matrix
441 
442  gsl_vector_free (_gsl_eigenvalue_vector);
443  gsl_matrix_free (_gsl_eigenvector_matrix);
444 
445  //next we reassign them to the reduced size (ie without unwanted components)
448 
449  //now copy over just the components we intend to keep into the blank resized
450  //_gsl_eigenvalue_vector and _gsl_eigenvector_matrix
451  int myCurrentColInt=0;
452  for (int j=0;j<_retained_components_count;j++)
453  {
454  double cmpValue = gsl_vector_get (tmp_gsl_eigenvalue_vector,j);
455  if (cmpValue > gsl_vector_get (myMeanPlusStdDevsVector,j))
456  {
457  //Good this is a component we want to keep
458  //...first copy over the vector value we are retaining
459  gsl_vector_set (_gsl_eigenvalue_vector,myCurrentColInt,cmpValue);
460  //...next copy over the associated matrix column for the component we are retaining
461  gsl_vector * tmp_gsl_vector = gsl_vector_alloc (_layer_count);
462  gsl_matrix_get_col (tmp_gsl_vector, tmp_gsl_eigenvector_matrix, j);
463  gsl_matrix_set_col (_gsl_eigenvector_matrix,myCurrentColInt,tmp_gsl_vector);
464  gsl_vector_free (tmp_gsl_vector);
465  //increment the column counter
466  myCurrentColInt++;
467  }//if check for retained componnet
468  }//j loop
469  //now clear away the temporary vars
470  gsl_vector_free (tmp_gsl_eigenvalue_vector);
471  gsl_matrix_free (tmp_gsl_eigenvector_matrix);
472 
473  //clean up
474  gsl_vector_free (myMeanVector);
475  gsl_vector_free (myStdDevVector);
476  gsl_vector_free (myMeanPlusStdDevsVector);
477  gsl_matrix_free (myMatrixOfEigenValueVectors);
478 
479  displayVector( _gsl_eigenvalue_vector, "Vector of retained eigen values:");
480  displayMatrix( _gsl_eigenvector_matrix,"Matrix of retained eigen vector:");
481 
482  Log::instance()->debug( "Completed CSM - Broken Stick\n" );
483  Log::instance()->debug( "%i out of %i components retained \n", _retained_components_count, _layer_count );
484  return 1;
485 }
float numberOfStdDevsFloat
Definition: csmbs.hh:92
void warn(const char *format,...)
'Warn' level.
Definition: Log.cpp:273
OM_ALG_DLL_EXPORT AlgMetadata const * algorithmMetadata()
Definition: csmbs.cpp:158
gsl_matrix * _gsl_eigenvector_matrix
Definition: csm.hh:442
virtual int initialize()
Definition: csm.cpp:98
static Log * instance()
Returns the instance pointer, creating the object on the first call.
Definition: Log.cpp:45
gsl_rng * _randomNumberGenerator
Definition: csmbs.hh:97
int calculateMeanAndSd(gsl_matrix *theMatrix, gsl_vector *theMeanVector, gsl_vector *theStdDevVector)
Definition: csm.cpp:175
#define NUM_PARAM
Definition: csmbs.cpp:40
Definition: csmbs.hh:42
bool verboseDebuggingBool
Definition: csm.hh:453
int _retained_components_count
Definition: csm.hh:446
gsl_vector * _gsl_eigenvalue_vector
Definition: csm.hh:440
int getParameter(std::string const &name, std::string *value)
int initialize()
Definition: csmbs.cpp:190
void displayMatrix(const gsl_matrix *m, const char *name, const bool roundFlag=true) const
Definition: csm.cpp:451
static AlgParamMetadata parameters[NUM_PARAM]
Definition: csmbs.cpp:46
int _layer_count
Definition: csm.hh:444
Definition: csm.hh:269
gsl_matrix * autoCovariance(gsl_matrix *m)
Definition: csm.cpp:578
gsl_matrix * randomiseColumns(gsl_matrix *original_matrix)
Definition: csmbs.cpp:278
void displayVector(const gsl_vector *v, const char *name, const bool roundFlag=true) const
Definition: csm.cpp:413
int _initialized
Definition: csm.hh:423
int minComponentsInt
Definition: csm.hh:451
~CsmBS()
Definition: csmbs.cpp:184
OM_ALG_DLL_EXPORT AlgorithmImpl * algorithmFactory()
Definition: csmbs.cpp:151
static AlgMetadata metadata
Definition: csmbs.cpp:115
gsl_matrix * _gsl_environment_matrix
Definition: csm.hh:430
gsl_matrix * createRandomMatrix(int size1, int size2)
Definition: csmbs.cpp:261
CsmBS()
Definition: csmbs.cpp:170
int numberOfRandomisationsInt
Definition: csmbs.hh:95
int discardComponents()
Definition: csmbs.cpp:318
void debug(const char *format,...)
'Debug' level.
Definition: Log.cpp:237