openModeller
Version 1.4.0
|
00001 // 00002 // Enfa 00003 // 00004 // Description: 00005 // 00006 // 00007 // Author: CRIA <chris.yesson@ioz.ac.uk>, (C) 2009 00008 // 00009 // Copyright: See COPYING file that comes with this distribution 00010 // 00011 // 00013 00014 #include <string.h> 00015 #include <cassert> 00016 #include <math.h> 00017 #include <gsl/gsl_statistics.h> 00018 #include <gsl/gsl_math.h> 00019 #include <gsl/gsl_permutation.h> 00020 #include <gsl/gsl_matrix.h> 00021 #include <gsl/gsl_eigen.h> 00022 #include <gsl/gsl_blas.h> 00023 #include <gsl/gsl_linalg.h> 00024 #include <gsl/gsl_errno.h> 00025 #include <gsl/gsl_sort_vector.h> 00026 00027 #include "enfa.hh" 00028 00029 00030 /****************************************************************/ 00031 /********************** Algorithm's Metadata ********************/ 00032 00033 #define NUM_PARAM 7 00034 #define BACKGROUND_ID "NumberOfBackgroundPoints" 00035 #define NUM_RETRIES "NumberOfRetries" 00036 #define DISCARD_METHOD "DiscardMethod" 00037 #define RETAIN_COMPONENTS "RetainComponents" 00038 #define RETAIN_VARIATION "RetainVariation" 00039 #define DISPLAY_LOADINGS "DisplayLoadings" 00040 #define VERBOSE_DEBUG "VerboseDebug" 00041 00042 static AlgParamMetadata parameters[NUM_PARAM] = { 00043 00044 { 00045 BACKGROUND_ID, // Id. 00046 "Number of background sample points", // Name. 00047 Integer, // Type. 00048 "Number of background points to be sampled when estimating the mean and standard deviation.", // Overview 00049 "The ENFA algorithm compares the species presence data with the background environment. This requires the calculation of the mean, standard deviation and covariances of each environmental layer. This could be prohibitively slow and expensive for large datasets, so estimate these values by randomly sampling X points from the background data.", // Description. 00050 1, // Not zero if the parameter has lower limit. 00051 10, // Parameter's lower limit. 00052 0, // Not zero if the parameter has upper limit. 00053 0, // Parameter's upper limit. 00054 "10000" // Parameter's typical (default) value. 00055 }, 00056 { 00057 NUM_RETRIES, // Id. 00058 "Number of retries of model", // Name. 00059 Integer, // Type. 00060 "Number of attempted retries in the case that the model generation fails.", // Overview 00061 "The algorithm requires the inversion of several matrices, but sometimes the matrix to invert is singular and so the algorithm fails. This seems to occur when the background data is undersampled or not representative. Often retrying the model generation (i.e. resampling the background data) makes this problem go away.", // Description. 00062 1, // Not zero if the parameter has lower limit. 00063 1, // Parameter's lower limit. 00064 0, // Not zero if the parameter has upper limit. 00065 100, // Parameter's upper limit. 00066 "5" // Parameter's typical (default) value. 00067 }, 00068 { 00069 DISCARD_METHOD, // Id. 00070 "Method for discarding components", // Name. 00071 Integer, // Type. 00072 "Method for discarding components: 0 - Fixed number (set by RETAIN_COMPONENTS), 1 - Minimum components explaining a fixed variation (set by RETAIN_VARIATION), 2 - Broken stick method", // Overview 00073 " 0 - Retain a fixed number of components defined by the variable RETAIN_COMPONENTS\n 1 - Retain the top N components that cumulatively explain the level of variation defined by the RETAIN_VARIATION variable\n 2 - Compare the observed explanation of variation to the broken stick distribution retaining those components explaining higher levels of variation", // Description. 00074 1, // Not zero if the parameter has lower limit. 00075 0, // Parameter's lower limit. 00076 1, // Not zero if the parameter has upper limit. 00077 2, // Parameter's upper limit. 00078 "2" // Parameter's typical (default) value. 00079 }, 00080 { 00081 RETAIN_COMPONENTS, // Id. 00082 "Number of components to retain", // Name. 00083 Integer, // Type. 00084 "Specify the number of components to retain (only for DISCARD_METHOD=0)", // Overview 00085 "If the Discard_method=0, then this variable is used to determine the number of components to retain.", // Description. 00086 1, // Not zero if the parameter has lower limit. 00087 1, // Parameter's lower limit. 00088 0, // Not zero if the parameter has upper limit. 00089 100, // Parameter's upper limit. 00090 "2" // Parameter's typical (default) value. 00091 }, 00092 { 00093 RETAIN_VARIATION, // Id. 00094 "Percent varition for component retention", // Name. 00095 Integer, // Type. 00096 "Specify the amount of variation that the retained components should explain (only for DISCARD_METHOD=1)", // Overview 00097 "If the Discard_method=1, then this variable is used to determine the number of components to retain, by taking those components that cumulatively account for at least this much variation.", // Description. 00098 1, // Not zero if the parameter has lower limit. 00099 0.5, // Parameter's lower limit. 00100 1, // Not zero if the parameter has upper limit. 00101 1.0, // Parameter's upper limit. 00102 "0.75" // Parameter's typical (default) value. 00103 }, 00104 { 00105 DISPLAY_LOADINGS, // Id. 00106 "Display variable loadings for each factor", // Name. 00107 Integer, // Type. 00108 "Output a comma separated matrix of variable loadings for each retained factor", // Overview 00109 "Set to 1 to display the matrix. Var=variable, Marg=Marginality (factor 0), Sp-1=Specialisation factor 1, etc. Variables are numbered in the order they are input in the request file.", // Description. 00110 1, // Not zero if the parameter has lower limit. 00111 0, // Parameter's lower limit. 00112 1, // Not zero if the parameter has upper limit. 00113 1, // Parameter's upper limit. 00114 "0" // Parameter's typical (default) value. 00115 }, 00116 { 00117 VERBOSE_DEBUG, // Id. 00118 "Verbose printing for debugging", // Name. 00119 Integer, // Type. 00120 "Print lots of details", // Overview 00121 "", // Description. 00122 1, // Not zero if the parameter has lower limit. 00123 0, // Parameter's lower limit. 00124 1, // Not zero if the parameter has upper limit. 00125 1, // Parameter's upper limit. 00126 "0" // Parameter's typical (default) value. 00127 }, 00128 }; 00129 00130 /****************************************************************/ 00131 /****************************** Enfa *****************************/ 00132 static AlgMetadata metadata = { // General metadata 00133 "ENFA", // Id 00134 "ENFA (Ecological-Niche Factor Analysis)", // Name 00135 "0.1.3", // Version 00136 "Algorithm based on presence only data using a modified principal components analysis.", // Overview 00137 "Ecological-Niche Factor Analysis (Hirzel et al, 2002) uses a modified principal components analysis to develop a model based on presence only data. The observed environment is compared to the background data of the study area (note that absence points in the occurrence file are treated as background data). The analysis produces factors similar to a PCA. The first factor is termed the 'marginality' of the species, marginality is defined as the ecological distance between the species optimum and the mean habitat within the background data. Other factors are termed the 'specialization', and are defined as the ratio of the ecological variance in mean habitat to that observed for the target species. Model projection uses the geomeans method of Hirzel & Arlettaz (2003)", // Description 00138 "Hirzel, A.H.; Hausser, J.; Chessel, D. & Perrin, N.", // Algorithm author 00139 "Hirzel, A.H.; Hausser, J.; Chessel, D. & Perrin, N. Ecological-niche factor analysis: How to compute habitat-suitability maps without absence data? Ecology, 2002, 83, 2027-2036\nHirzel, A.H & Arlettaz, R. Modeling habitat suitability for complex species distributions by environmental distance geometric mean Environmental Management, 2003, 32, 614-623\n", // Bibliography 00140 "Chris Yesson", // Code author 00141 "chris.yesson [at] ioz.ac.uk", // Code author's contact 00142 0, // Does not accept categorical data 00143 0, // Does not need (pseudo)absence points 00144 NUM_PARAM, parameters // Algorithm's parameters 00145 }; 00146 00147 00148 00149 /****************************************************************/ 00150 /****************** Algorithm's factory function ****************/ 00151 00152 OM_ALG_DLL_EXPORT 00153 AlgorithmImpl * 00154 algorithmFactory() 00155 { 00156 return new Enfa(); 00157 } 00158 00159 OM_ALG_DLL_EXPORT 00160 AlgMetadata const * 00161 algorithmMetadata() 00162 { 00163 return &metadata; 00164 } 00165 00166 00171 Enfa::Enfa() : 00172 AlgorithmImpl( &metadata ) 00173 { 00174 _initialized = 0; 00175 _gsl_environment_matrix = 0; 00176 _gsl_environment_factor_matrix = 0; 00177 _gsl_background_matrix = 0; 00178 _gsl_covariance_matrix = 0; 00179 _gsl_covariance_background_matrix = 0; 00180 _gsl_avg_vector = 0; 00181 _gsl_stddev_vector = 0; 00182 _gsl_avg_background_vector = 0; 00183 _gsl_stddev_background_vector = 0; 00184 _gsl_eigenvalue_vector = 0; 00185 _gsl_eigenvector_matrix = 0; 00186 _gsl_score_matrix = 0; 00187 _gsl_covariance_matrix_root_inverse = 0; 00188 _gsl_workspace_H = 0; 00189 _gsl_workspace_W = 0; 00190 _gsl_workspace_y = 0; 00191 _gsl_workspace_z = 0; 00192 _gsl_factor_weights_all_components = 0; 00193 _gsl_factor_weights = 0; 00194 _gsl_geomean_vector = 0; 00195 } 00196 00197 00199 Enfa::~Enfa() 00200 { 00201 if ( _initialized ) 00202 { 00203 if (_gsl_background_matrix) 00204 { 00205 gsl_matrix_free (_gsl_background_matrix); 00206 } 00207 if (_gsl_covariance_background_matrix) 00208 { 00209 gsl_matrix_free (_gsl_covariance_background_matrix); 00210 } 00211 if (_gsl_covariance_matrix) 00212 { 00213 gsl_matrix_free (_gsl_covariance_matrix); 00214 } 00215 if (_gsl_covariance_matrix_root_inverse) 00216 { 00217 gsl_matrix_free (_gsl_covariance_matrix_root_inverse); 00218 } 00219 if (_gsl_eigenvector_matrix) 00220 { 00221 gsl_matrix_free (_gsl_eigenvector_matrix); 00222 } 00223 if (_gsl_environment_factor_matrix) 00224 { 00225 gsl_matrix_free (_gsl_environment_factor_matrix); 00226 } 00227 if (_gsl_environment_matrix) 00228 { 00229 gsl_matrix_free (_gsl_environment_matrix); 00230 } 00231 if (_gsl_score_matrix) 00232 { 00233 gsl_matrix_free (_gsl_score_matrix); 00234 } 00235 if (_gsl_workspace_H) 00236 { 00237 gsl_matrix_free (_gsl_workspace_H); 00238 } 00239 if (_gsl_workspace_W) 00240 { 00241 gsl_matrix_free (_gsl_workspace_W); 00242 } 00243 if (_gsl_workspace_y) 00244 { 00245 gsl_matrix_free (_gsl_workspace_y); 00246 } 00247 if (_gsl_avg_vector) 00248 { 00249 gsl_vector_free (_gsl_avg_vector); 00250 } 00251 if (_gsl_avg_background_vector) 00252 { 00253 gsl_vector_free (_gsl_avg_background_vector); 00254 } 00255 if (_gsl_eigenvalue_vector) 00256 { 00257 gsl_vector_free (_gsl_eigenvalue_vector); 00258 } 00259 if (_gsl_stddev_vector) 00260 { 00261 gsl_vector_free (_gsl_stddev_vector); 00262 } 00263 if (_gsl_stddev_background_vector) 00264 { 00265 gsl_vector_free (_gsl_stddev_background_vector); 00266 } 00267 if (_gsl_workspace_z) 00268 { 00269 gsl_vector_free (_gsl_workspace_z); 00270 } 00271 if (_gsl_factor_weights) 00272 { 00273 gsl_vector_free (_gsl_factor_weights); 00274 } 00275 if (_gsl_factor_weights_all_components) 00276 { 00277 gsl_vector_free (_gsl_factor_weights_all_components); 00278 } 00279 if (_gsl_geomean_vector) 00280 { 00281 gsl_vector_free (_gsl_geomean_vector); 00282 } 00283 } 00284 } 00285 00286 00287 // 00288 // Methods used to build the model 00289 // 00290 00291 00292 00293 00301 int Enfa::initialize() 00302 { 00303 _initialized = 1; 00304 00305 Log::instance()->info( "Base ENFA class initializing\n" ); 00306 00307 //set the class member that holds the number of environmental variables 00308 //we subtract 1 because the first column contains the specimen count 00309 _layer_count = _samp->numIndependent(); 00310 //set the class member that holds the number of occurences 00311 _localityCount = _samp->numPresence(); 00312 //Log::instance()->info( "Checking more than 1 samples exist\n" ); 00313 if (_localityCount<2) 00314 { 00315 Log::instance()->warn( "ENFA needs at least two occurrence points..aborting...\n" ); 00316 _initialized = 0; 00317 return 0; 00318 } 00319 00320 // _backgroundCount = _backgroundSamp->numOccurrences(); 00321 //if ( _samp->numAbsence() ) 00322 if ( _samp->numAbsence() ) 00323 { 00324 _backgroundProvided=1; 00325 _backgroundCount = _samp->numAbsence(); 00326 Log::instance()->info( "Using background data provided (%i records marked as absences in the occurrence file)\n",_backgroundCount); 00327 00328 } 00329 else 00330 { 00331 getParameter( BACKGROUND_ID, &_backgroundCount ); 00332 _backgroundProvided=0; 00333 } 00334 00335 //use the pseudoabsence generator to do this 00336 getParameter( NUM_RETRIES, &_numRetries ); 00337 getParameter( DISCARD_METHOD, &_discardMethod); 00338 getParameter( RETAIN_COMPONENTS, &_retainComponents); 00339 getParameter( RETAIN_VARIATION, &_retainVariation); 00340 getParameter( DISPLAY_LOADINGS, &_displayLoadings); 00341 getParameter( VERBOSE_DEBUG, &_verboseDebug); 00342 00343 /* sometimes our random background samples produce singular matrices 00344 so catch this exception and restart the model */ 00345 00346 _retryCount=0; 00347 bool myFlag=false; 00348 00349 while (_retryCount<_numRetries && !myFlag) 00350 { 00351 00352 _retryCount+=1; 00353 try 00354 { 00355 00356 //convert the sampler to a matrix and store in the local gsl_matrix 00357 //Log::instance()->info( "Converting samples to GSL_Matrix\n" ); 00358 if (!SamplerToMatrix()) 00359 { 00360 Log::instance()->warn( "All occurences are outside the masked area!\n" ); 00361 _initialized = 0; 00362 return 0; 00363 } 00364 00365 if (!BackgroundToMatrix()) 00366 { 00367 Log::instance()->warn( "Failed to sample background data!\n" ); 00368 _initialized = 0; 00369 return 0; 00370 } 00371 00372 myFlag = enfa1(); 00373 00374 } 00375 catch (InverseFailedException& exception) 00376 { 00377 UNUSED(exception); 00378 Log::instance()->warn( "Model failed, retry number %i\n", _retryCount ); 00379 myFlag=false; 00380 } 00381 00382 } 00383 00384 if (! myFlag) 00385 { 00386 Log::instance()->warn( "Model initialization failed!\n" ); 00387 } 00388 00389 return myFlag; 00390 } 00391 00396 int Enfa::SamplerToMatrix() 00397 { 00398 if (_localityCount < 1) 00399 { 00400 return 0; 00401 } 00402 00403 OccurrencesImpl::const_iterator pit = _samp->getPresences()->begin(); 00404 OccurrencesImpl::const_iterator fin = _samp->getPresences()->end(); 00405 00406 // Allocate the gsl matrix to store environment data at each locality 00407 _gsl_environment_matrix = gsl_matrix_alloc (_localityCount, _layer_count); 00408 00409 // now populate the gsl matrix from the sample data 00410 for (int i=0; pit != fin; ++pit, ++i) 00411 { 00412 for (int j=0;j<_layer_count;j++) 00413 { 00414 //we add one to j in order to omit the specimen count column 00415 double myCellValue = (double)(*pit)->environment()[j]; 00416 gsl_matrix_set (_gsl_environment_matrix,i,j,myCellValue); 00417 } 00418 } 00419 00420 //Log::instance()->info( "Enfa::SampleToMatrix: x: %i y: %i\n",_layer_count,_localityCount ); 00421 return 1; 00422 } 00423 00424 // sample background data and store it in a gsl_matrix 00425 int Enfa::BackgroundToMatrix() 00426 { 00427 00428 //Log::instance()->info("Enfa:BackgroundToMatrix:Generating background samples.\n" ); 00429 00430 // try getting all background samples at once using the absence generator 00431 // this enables us to ensure geographic uniqueness 00432 // question: what happens when num-background > number of cells in env layer? 00433 // allow user to provide background points using the absence parser 00434 OccurrencesPtr _ocbg; 00435 OccurrencesImpl::const_iterator pit; 00436 OccurrencesImpl::const_iterator fin; 00437 00438 if (_backgroundProvided==1) 00439 { 00440 _ocbg = _samp->getAbsences(); 00441 } 00442 else 00443 { 00444 _ocbg = _samp->getPseudoAbsences(_backgroundCount, 0, 1, false, false); 00445 } 00446 00447 pit = _ocbg->begin(); 00448 fin = _ocbg->end(); 00449 00450 // Allocate the gsl matrix to store environment data at each background point 00451 _gsl_background_matrix = gsl_matrix_alloc (_backgroundCount, _layer_count); 00452 // now populate the gsl matrix from the sample data 00453 for (int i=0; pit != fin; ++pit, ++i) 00454 { 00455 for (int j=0;j<_layer_count;j++) 00456 { 00457 //we add one to j in order to omit the specimen count column 00458 double myCellValue = (double)(*pit)->environment()[j]; 00459 gsl_matrix_set (_gsl_background_matrix,i,j,myCellValue); 00460 } 00461 } 00462 00463 return 1; 00464 } 00465 00466 00467 00469 int Enfa::calculateMeanAndSd(gsl_matrix * theMatrix, 00470 gsl_vector * theMeanVector, 00471 gsl_vector * theStdDevVector) 00472 { 00473 #ifndef WIN32 00474 assert (theMatrix != 0); 00475 assert (theMeanVector !=0); 00476 assert (theStdDevVector !=0); 00477 #endif 00478 //Initialise the vector to hold the mean of each column 00479 gsl_vector_set_zero(theMeanVector); 00480 00481 //Initialise the vector to hold the stddev of each column 00482 gsl_vector_set_zero(theStdDevVector); 00483 00484 //calculate the mean and stddev of each column 00485 for (int j = 0; j < _layer_count; j++) 00486 { 00487 //get the current column from the array as a vector 00488 gsl_vector_view myColumn = gsl_matrix_column (theMatrix, j); 00489 //calculate the average for the column ... 00490 double myAverage = gsl_stats_mean (myColumn.vector.data, myColumn.vector.stride, myColumn.vector.size); 00491 // ...and assign it to the jth element in the column means vector 00492 gsl_vector_set (theMeanVector,j,myAverage); 00493 //calculate the stddev for the column and ... 00494 double myStdDev = gsl_stats_sd (myColumn.vector.data, myColumn.vector.stride, myColumn.vector.size); 00495 // ...and assign it to the jth element in the column stddev vector 00496 gsl_vector_set (theStdDevVector,j,myStdDev); 00497 } 00498 00499 return 0; 00500 } 00501 00502 int Enfa::center(gsl_matrix * theMatrix, 00503 int spCount) 00504 { 00505 #ifndef WIN32 00506 assert (theMatrix != 0); 00507 #endif 00508 // 00509 //Subtract the column mean from every value in each column 00510 //Divide each resultant column value by the stddev for that column 00511 //Note that we are walking the matrix column wise 00512 // 00513 //Log::instance()->info( "Centering data\n" ); 00514 //for (int j=0;j<_layer_count;j++) 00515 int msize=theMatrix->size1; 00516 for (int j=0;j<_layer_count;j++) 00517 { 00518 //get the stddev and mean for this column 00519 double myAverage = gsl_vector_get (_gsl_avg_background_vector,j); 00520 double myStdDev = gsl_vector_get (_gsl_stddev_background_vector,j); 00521 00522 for (int i=0;i<msize;++i) 00523 { 00524 double myDouble = gsl_matrix_get (theMatrix,i,j); 00525 if (myStdDev > 0) 00526 { 00527 myDouble = (myDouble-myAverage)/myStdDev; 00528 } 00529 else 00530 { 00531 myDouble=0.0; 00532 } 00533 //update the gsl_matrix with the new value 00534 gsl_matrix_set(theMatrix,i,j,myDouble); 00535 } 00536 } 00537 00538 return 0; 00539 } 00540 00541 00546 int Enfa::iterate() 00547 { 00548 _done=1; 00549 return 1; 00550 } 00551 00552 00559 int Enfa::done() const 00560 { 00561 return _done; 00562 } 00563 00564 00565 // 00566 // Methods used to project the model 00567 // 00568 00569 00574 Scalar Enfa::getValue( const Sample& x ) const 00575 { 00576 int ii, cellcount; 00577 float myFloat; 00578 00579 //Log::instance()->info( "Enfa:getValue:entering getvalue\n" ); 00580 00581 //first thing we do is convert the oM primitive env value array 00582 // to a gsl vector so we can use matrix multplication with it 00583 gsl_vector * tmp_raw_gsl_vector = gsl_vector_alloc (_layer_count); 00584 // center data using the avg and stdev 00585 gsl_vector * tmp_centered_gsl_vector = gsl_vector_alloc(_layer_count); 00586 double m1, m2, m3, m4; 00587 00588 for (ii=0;ii<_layer_count;++ii) 00589 { 00590 myFloat = static_cast<float>(x[ii]); 00591 gsl_vector_set (tmp_raw_gsl_vector,ii,myFloat); 00592 00593 m1=myFloat; 00594 m2=gsl_vector_get(_gsl_avg_background_vector,ii); 00595 m3=gsl_vector_get(_gsl_stddev_background_vector,ii); 00596 m4=(m1-m2)/m3; 00597 //Log::instance()->info( "getValue: m1: %6.2f, m2: %6.2f, m3: %6.2f, m4: %6.2f, \n", m1, m2, m3, m4 ); 00598 gsl_vector_set(tmp_centered_gsl_vector, ii, m4); 00599 } 00600 00601 //if (_verboseDebug) 00602 //{ 00603 //displayVector(tmp_raw_gsl_vector, "getvalue: tmp_raw_gsl_vector", true); 00604 //displayVector(tmp_centered_gsl_vector, "getvalue: tmp_centered_gsl_vector", true); 00605 //} 00606 00607 gsl_vector_free(tmp_raw_gsl_vector); 00608 00609 //vector for factored data 00610 gsl_matrix * tmp_centered_gsl_matrix = gsl_matrix_alloc(1,_layer_count); 00611 for (ii=0;ii<_layer_count;++ii) 00612 gsl_matrix_set(tmp_centered_gsl_matrix,0,ii,gsl_vector_get(tmp_centered_gsl_vector,ii)); 00613 00614 gsl_vector_free(tmp_centered_gsl_vector); 00615 00616 gsl_matrix * tmp_factored_gsl_matrix = gsl_matrix_alloc(1,_layer_count); 00617 00618 // multiply centered data by score matrix to get factored data 00619 //gsl_blas_dgemv (CblasTrans, 1.0, _gsl_score_matrix, tmp_centered_gsl_vector, 0.0, tmp_factored_gsl_vector); 00620 //gsl_blas_dgemv (CblasNoTrans, 1.0, _gsl_score_matrix, tmp_centered_gsl_vector, 0.0, tmp_factored_gsl_vector); 00621 00622 gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 00623 1.0, tmp_centered_gsl_matrix, _gsl_score_matrix, 00624 0.0, tmp_factored_gsl_matrix); 00625 00626 gsl_vector* tmp_factored_gsl_vector = gsl_vector_alloc(_layer_count); 00627 for (ii=0;ii<_layer_count;++ii) 00628 gsl_vector_set(tmp_factored_gsl_vector,ii, 00629 gsl_matrix_get(tmp_factored_gsl_matrix,0,ii)); 00630 00631 //if (_verboseDebug) 00632 //displayVector(tmp_factored_gsl_vector, "getvalue: tmp_factored_gsl_vector", true); 00633 00634 // work out geomean from this point to all specimen localites 00635 double tmp_geomean = getGeomean(tmp_factored_gsl_vector); 00636 00637 gsl_matrix_free(tmp_centered_gsl_matrix); 00638 gsl_matrix_free(tmp_factored_gsl_matrix); 00639 gsl_vector_free(tmp_factored_gsl_vector); 00640 00641 //Log::instance()->info( "getValue: geomean %6.2f\n", tmp_geomean ); 00642 00643 00644 /* % OK, now to convert these into habitat suitability values. 00645 % Determine what percentage of the species presence points 00646 % are further away from zero than this point */ 00647 00648 //cellcount=0; 00649 00650 //old version checking every value on unsorted vector 00651 //for (ii=0; ii<_localityCount; ++ii) 00652 // if (gsl_vector_get(_gsl_geomean_vector,ii)>=tmp_geomean) cellcount+=1; 00653 00654 // step up the sorted geomeans vector stop when current geomean is higher 00655 // quickest when most cells are of low suitability 00656 cellcount=_localityCount; 00657 for (ii=_localityCount-1; ii>=0; --ii) 00658 if (gsl_vector_get(_gsl_geomean_vector,ii)<=tmp_geomean) 00659 { 00660 cellcount=_localityCount-ii-1; 00661 break; 00662 } 00663 00664 //Log::instance()->info( "getValue: cellcount %i\n", cellcount ); 00665 Scalar myReturn=(Scalar)cellcount/_localityCount; 00666 00667 //Log::instance()->info( "enfa::getValue %6.2f\n", myReturn ); 00668 00669 return myReturn; 00670 } 00671 00678 int Enfa::getConvergence( Scalar * const val ) const 00679 { 00680 return 0; 00681 } 00682 00683 // 00684 // General Helper Methods 00685 // 00686 00687 00688 void Enfa::displayVector(const gsl_vector * v, const char * name, const bool roundFlag) const 00689 { 00690 if (roundFlag) 00691 { 00692 00693 fprintf( stderr, "\nDisplaying Vector rounded to 4 decimal places '%s' (%u): \n----------------------------------------------\n[ ", name, static_cast<unsigned int>(v->size) ); 00694 } 00695 else 00696 { 00697 fprintf( stderr, "\nDisplaying Vector '%s' (%u): \n----------------------------------------------\n[ ", name, static_cast<unsigned int>(v->size) ); 00698 } 00699 00700 char sep1[] = ", "; 00701 00702 for (unsigned int i=0;i<v->size;++i) 00703 { 00704 if (i == v->size -1) 00705 strcpy(sep1, " ]"); 00706 00707 double myDouble = gsl_vector_get (v,i); 00708 if (roundFlag) 00709 { 00710 fprintf( stderr, "%.4g %s", myDouble, sep1 ); 00711 } 00712 else 00713 { 00714 fprintf( stderr, "%g %s", myDouble, sep1 ); 00715 } 00716 } 00717 fprintf( stderr, "\n----------------------------------------------\n" ); 00718 } 00719 00720 00721 /**********************/ 00722 /**** displayMatrix ***/ 00723 void Enfa::displayMatrix(const gsl_matrix * m, const char * name, const bool roundFlag) const 00724 { 00725 if (!roundFlag) 00726 { 00727 fprintf( stderr, "\nDisplaying Matrix '%s' (%u / %u): \n----------------------------------------------\n[\n", name, static_cast<unsigned int>(m->size1), static_cast<unsigned int>(m->size2) ); 00728 } 00729 else 00730 { 00731 fprintf( stderr, "\nDisplaying Matrix rounded to 6 decimal places '%s' (%u / %u): \n----------------------------------------------\n[\n", name, static_cast<unsigned int>(m->size1), static_cast<unsigned int>(m->size2) ); 00732 } 00733 for (unsigned int i=0;i<m->size1;++i) 00734 { 00735 char sep1[] = ","; 00736 char sep2[] = ";"; 00737 00738 for (unsigned int j=0;j<m->size2;j++) 00739 { 00740 double myDouble = gsl_matrix_get (m,i,j); 00741 00742 if (j == m->size2 -1) 00743 strcpy(sep1, ""); 00744 if (!roundFlag) 00745 { 00746 fprintf( stderr, "%g %s ", myDouble, sep1 ); 00747 } 00748 else 00749 { 00750 fprintf( stderr, "%.6g %s ", myDouble, sep1 ); 00751 } 00752 } 00753 00754 fprintf( stderr, "%s\n", sep2 ); 00755 } 00756 fprintf( stderr, "]\n----------------------------------------------\n" ); 00757 } 00758 00759 /**********************/ 00760 /**** displayMatrix ***/ 00761 void Enfa::displayLoadings(const gsl_matrix * m, const int f) const 00762 { 00763 Log::instance()->info( "Factor Loadings:\n"); 00764 00765 // generate header 00766 Log::instance()->info( "Var,Marg"); 00767 //fprintf( stderr, "Var,Marg"); 00768 00769 for (signed int j=1;j<f;j++) 00770 { 00771 fprintf( stderr, ",Sp-%i", j); 00772 } 00773 00774 fprintf( stderr, "\n"); 00775 00776 for (unsigned int i=0;i<m->size1;++i) 00777 { 00778 Log::instance()->info( "%i",i); 00779 //fprintf( stderr, "%i",i); 00780 for (signed int j=0;j<f;j++) 00781 { 00782 double myDouble = gsl_matrix_get (m,i,j); 00783 00784 fprintf( stderr, ",%3.3f", myDouble); 00785 } 00786 fprintf( stderr, "\n"); 00787 } 00788 } 00789 00790 /***************************************************************** 00791 function to find the 'square-root' of a matrix: 00792 the input matrix must be positive semi-definite symmetric matrix 00793 note that covariance matrices have these properties */ 00794 gsl_matrix* Enfa::sqrtm(gsl_matrix* original_matrix) const 00795 { 00796 int m_size, i, j; 00797 double lambda, v; 00798 00799 /* make a copy of the input matrix */ 00800 // Build a copy of the input matrix to work with 00801 m_size = original_matrix->size1; 00802 00803 gsl_matrix* m = gsl_matrix_alloc (m_size, m_size); 00804 gsl_matrix_memcpy (m, original_matrix); 00805 00806 //create temporary space for eigenvalues, eigenvectors & workspace 00807 gsl_vector* eigval_v = gsl_vector_alloc (m_size); 00808 gsl_matrix* eigvect_m = gsl_matrix_alloc (m_size, m_size); 00809 gsl_eigen_symmv_workspace * temp_v = gsl_eigen_symmv_alloc (m_size); 00810 00811 gsl_eigen_symmv (m, 00812 eigval_v, 00813 eigvect_m, 00814 temp_v); 00815 00816 gsl_eigen_symmv_free(temp_v); 00817 00818 /* allocate and initialise a new matrix temp_m */ 00819 /* this is eigvect_m scaled by the root of the eigenvalues */ 00820 gsl_matrix* temp_m = gsl_matrix_alloc (m_size, m_size); 00821 for (j=0; j<m_size; ++j) { 00822 // ignore very small negative numbers 00823 v=gsl_vector_get(eigval_v,j); 00824 if (v<0 && v>-0.000001) 00825 {lambda=0.0;} 00826 else if (v>0) 00827 {lambda=pow(v,0.5);} 00828 else 00829 { 00830 gsl_matrix_free(temp_m); 00831 gsl_matrix_free(m); 00832 gsl_matrix_free(eigvect_m); 00833 gsl_vector_free(eigval_v); 00834 std::string msg = "Enfa::sqrtm:Cannot calculate square root for matrix - model will fail\n", v; 00835 Log::instance()->error( msg.c_str() ); 00836 throw InverseFailedException( msg.c_str() ); 00837 } 00838 00839 for (i=0; i<m_size; ++i) { 00840 gsl_matrix_set(temp_m, i, j, gsl_matrix_get(eigvect_m,i,j)*lambda); 00841 } 00842 } 00843 00844 /* calculate the square root =temp_m * eigvect' */ 00845 gsl_matrix* _root = gsl_matrix_alloc (m_size, m_size); 00846 // multiply using blas (note transpose for second item) 00847 gsl_blas_dgemm(CblasNoTrans, CblasTrans, 00848 1.0, temp_m, eigvect_m, 00849 0.0, _root); 00850 00851 /* code for checking the root - not really needed */ 00877 gsl_matrix_free(temp_m); 00878 gsl_matrix_free(m); 00879 gsl_matrix_free(eigvect_m); 00880 gsl_vector_free(eigval_v); 00881 00882 return _root; 00883 00884 } 00885 00886 /***************************************************************** 00887 * Calculate the geometric mean of the distance from point v to 00888 all species observation points in factored environmental space 00889 weighting each factor accordingly 00890 *****************************************************************/ 00891 double Enfa::getGeomean(gsl_vector* v) const 00892 { 00893 00894 //Log::instance()->info( "enfa::getGeomean\n" ); 00895 00896 //if (_verboseDebug) 00897 //displayVector(v, "getGeomean input vector v:", true); 00898 00899 gsl_vector * tmp_v = gsl_vector_alloc(_retained_components_count); 00900 gsl_vector * tmp_gsl_factor_weights = gsl_vector_alloc(_retained_components_count); 00901 00902 for (int i=0; i<_retained_components_count; ++i) 00903 { 00904 gsl_vector_set(tmp_v, i,gsl_vector_get(v,i)); 00905 gsl_vector_set(tmp_gsl_factor_weights, i, gsl_vector_get(_gsl_factor_weights,i)); 00906 } 00907 00908 gsl_vector * tmp_distance_vector = gsl_vector_alloc(_retained_components_count); 00909 //gsl_vector * tmp_distance_vector = gsl_vector_alloc(_layer_count); 00910 00911 gsl_vector_view tmp_matrix_row_view; 00912 double tmp_geomean=1.0; 00913 double tmp_dist_workspace; 00914 00915 // loop through localities 00916 for (int i=0; i<_localityCount; ++i) 00917 { 00918 tmp_matrix_row_view = gsl_matrix_row(_gsl_environment_factor_matrix,i); 00919 //displayVector(&tmp_matrix_row_view.vector, "getGeomean: locality vector i", true); 00920 //for (int j=0; j<_layer_count; ++j) 00921 // dont need to calculate beyonde the retained components 00922 for (int j=0; j<_retained_components_count; ++j) 00923 gsl_vector_set(tmp_distance_vector, j, 00924 gsl_vector_get(&tmp_matrix_row_view.vector,j)); 00925 //displayVector(tmp_distance_vector, "getGeomean: locality vector i copy ", true); 00926 00927 // difference between cell and current locality 00928 gsl_vector_sub(tmp_distance_vector,tmp_v); 00929 //gsl_vector_sub(tmp_distance_vector,v); 00930 //displayVector(tmp_distance_vector, "getGeomean: locality vector i subbed ", true); 00931 00932 // square the differences 00933 gsl_vector_mul(tmp_distance_vector, tmp_distance_vector); 00934 //displayVector(tmp_distance_vector, "getGeomean: locality vector i squared ", true); 00935 // multiply by the factor weights 00936 gsl_vector_mul(tmp_distance_vector,tmp_gsl_factor_weights); 00937 //gsl_vector_mul(tmp_distance_vector,_gsl_factor_weights); 00938 // square and sum the values to get the distance 00939 //displayVector(tmp_distance_vector, "getGeomean: locality vector i weighted ", true); 00940 00941 tmp_dist_workspace=pow(gsl_blas_dasum(tmp_distance_vector),0.5); 00942 //Log::instance()->info( "euclidean distance: %6.2f\n", tmp_dist_workspace ); 00943 00944 // accumulate for geometric mean calculation 00945 // if (tmp_dist_workspace!=0) tmp_geomean*=tmp_dist_workspace; 00946 // log transform values to reduce chance of float overflow 00947 if (tmp_dist_workspace!=0) tmp_geomean+=log(tmp_dist_workspace); 00948 //Log::instance()->info( "accumulating distance: %6.2f\n", tmp_geomean ); 00949 } 00950 00951 //Log::instance()->info( "tmp_geomean1: %6.2f\n", tmp_geomean ); 00952 00953 // finally work out geometric mean of distances to the species points 00954 // tmp_geomean=pow(tmp_geomean,(double)1/_localityCount); 00955 // reverse the log transformation 00956 tmp_geomean=exp(tmp_geomean/_localityCount); 00957 00958 //if (_verboseDebug) 00959 // Log::instance()->info( "getgeomean returns: %6.2f\n", tmp_geomean ); 00960 00961 gsl_vector_free(tmp_distance_vector); 00962 gsl_vector_free(tmp_v); 00963 gsl_vector_free(tmp_gsl_factor_weights); 00964 00965 return tmp_geomean; 00966 } 00967 00968 /* calculate the inverse using cholesky decomposition */ 00969 gsl_matrix* Enfa::inverse(gsl_matrix* _m) const 00970 { 00971 // dont crash on error - catch it 00972 gsl_error_handler_t* old_handler = gsl_set_error_handler_off(); 00973 00974 int m_size=_m->size1; 00975 gsl_matrix*_mcopy = gsl_matrix_alloc(m_size, m_size); 00976 gsl_matrix*_inverse = gsl_matrix_alloc(m_size, m_size); 00977 gsl_matrix_set_identity(_inverse); 00978 gsl_matrix_memcpy(_mcopy,_m); 00979 _gsl_vector_view _MI; 00980 00981 int choldecomp; 00982 choldecomp = gsl_linalg_cholesky_decomp(_mcopy); 00983 Log::instance()->debug( "Cholesky decomp result %i\n", choldecomp ); 00984 00985 int cholsvx; 00986 for (int i=0; i<m_size; i++) 00987 { 00988 _MI=gsl_matrix_row(_inverse,i); 00989 cholsvx = gsl_linalg_cholesky_svx(_mcopy, &_MI.vector); 00990 Log::instance()->debug( "Cholesky svx result %i\n", cholsvx ); 00991 //displayVector(&_MI.vector, "&MI.vector", true); 00992 } 00993 00994 00995 // sometimes the inverse fails - check it is correct 00996 gsl_matrix *_testInverse = gsl_matrix_alloc(m_size, m_size); 00997 gsl_blas_dgemm(CblasNoTrans,CblasNoTrans,1.0, 00998 _m,_inverse,0.0,_testInverse); 00999 01000 int mycomp, myval; 01001 for (int i=0; i<m_size; i++) 01002 for (int j=0; j<m_size; j++) 01003 { 01004 // check values equal identity matrix (to accuracy of Xdp) 01005 myval=(int)(ceil(gsl_matrix_get(_testInverse,i,j)*pow((double)10,6)-0.49999999)/pow((double)10,6)); 01006 mycomp = (i == j) ? 1 : 0; 01007 if (myval!=mycomp) 01008 { 01009 Log::instance()->warn( "Enfa::inverse failed to invert matrix!\n" ); 01010 //displayMatrix(_m, "input matrix", true); 01011 //displayMatrix(_inverse, "inverse", true); 01012 //displayMatrix(_testInverse, "check inverse", true); 01013 gsl_matrix_free(_mcopy); 01014 gsl_matrix_free(_testInverse); 01015 std::string msg = "Enfa::inverse failed to invert matrix\n"; 01016 Log::instance()->error( msg.c_str() ); 01017 throw InverseFailedException( msg.c_str() ); 01018 } 01019 } 01020 01021 gsl_matrix_free(_mcopy); 01022 gsl_matrix_free(_testInverse); 01023 01024 // restore error handler 01025 gsl_set_error_handler (old_handler); 01026 01027 return _inverse; 01028 } 01029 01030 /***********************/ 01031 /**** autoCovariance ***/ 01032 /* calculate covariance matrix as cov(<matrix>) matlab/octave function */ 01033 gsl_matrix * Enfa::autoCovariance(gsl_matrix * original_matrix) 01034 { 01035 int j; 01036 01037 int numrows = original_matrix->size1; 01038 int numcols = original_matrix->size2; 01039 01040 // Build a copy of the input matrix to work with 01041 gsl_matrix * m = gsl_matrix_alloc (numrows, numcols); 01042 gsl_matrix_memcpy (m, original_matrix); 01043 01044 01045 if (numrows == 1) 01046 { 01047 gsl_matrix_transpose(m); 01048 } 01049 01050 // compute: ones (n, 1) * sum (x) 01051 gsl_matrix * s = gsl_matrix_alloc (numrows, numcols); 01052 01053 if (numrows == 1) 01054 { 01055 gsl_matrix_memcpy (m, s); 01056 } 01057 else 01058 { 01059 gsl_vector * v = gsl_vector_alloc(numcols); 01060 01061 for (int i = 0; i < numcols; i++) 01062 { 01063 double val = 0.0; 01064 01065 for (j = 0; j < numrows; j++) 01066 { 01067 val += gsl_matrix_get (m, j, i); 01068 } 01069 01070 gsl_vector_set (v, i, val); 01071 01072 for (j = 0; j < numrows; j++) 01073 { 01074 gsl_matrix_set_row (s, j, v); 01075 } 01076 } 01077 gsl_vector_free(v); 01078 } 01079 01080 // divide by "n" 01081 gsl_matrix_scale (s, (double)1/numrows); 01082 01083 // subtract the result from x 01084 gsl_matrix_sub (m, s); 01085 01086 // x / (n - 1) 01087 //gsl_matrix_scale (m, (double)1/(numrows-1)); 01088 01089 // multiply by x' 01090 gsl_matrix * p = gsl_matrix_alloc (numcols, numcols); 01091 gsl_blas_dgemm(CblasTrans,CblasNoTrans,1.0, 01092 m,m,0.0,p); 01093 01094 // scale the result 01095 gsl_matrix_scale (p, (double)1/(numrows-1)); 01096 01097 gsl_matrix_free (m); 01098 gsl_matrix_free (s); 01099 01100 return p; 01101 } 01102 01103 /**********************************************/ 01104 /* decide which components to retain/discard */ 01105 /* using the method defined by _discardMethod */ 01106 /**********************************************/ 01107 int Enfa::discardComponents() const 01108 { 01109 double variationTotal=0; 01110 int returnValue=_layer_count; 01111 01112 // take the top X components 01113 if (_discardMethod==0) 01114 { 01115 Log::instance()->info( "Discarding components with fixed method (0)\n"); 01116 returnValue=_retainComponents; 01117 for (int i=0; i<_retainComponents; ++i) 01118 variationTotal+=gsl_vector_get(_gsl_factor_weights_all_components,i); 01119 } 01120 // take the components that account for X % of variation 01121 else if (_discardMethod==1) 01122 { 01123 Log::instance()->info( "Discarding components with variation method (1) (variation=%6.2f)\n", _retainVariation); 01124 for (int i=0; i<_layer_count; ++i) 01125 { 01126 variationTotal+=gsl_vector_get(_gsl_factor_weights_all_components,i); 01127 if (variationTotal>=_retainVariation) 01128 { 01129 // variation exceeds specified amount - keep these components 01130 returnValue=i+1; 01131 break; 01132 } 01133 } 01134 } 01135 // take the components with variation higher than broken stick distribution 01136 else if (_discardMethod==2) 01137 { 01138 Log::instance()->info("Discarding components: Broken stick method (2)\n"); 01139 /* work out the broken stick distribution of expected eigenvalues 01140 * based on Jackson's formula for the kth eigenvalue 01141 * bk = sum[i=k...p](1/i) where p=number of eigenvalues */ 01142 /* use broken stick only for specialisation factors 01143 as often marginality weight is lower than other factors, 01144 but we always want to keep marginality as the first factor */ 01145 gsl_vector* _brokenStickDist = gsl_vector_alloc(_layer_count-1); 01146 for (int i=0; i<_layer_count-1; ++i) 01147 { 01148 double tmp_total=0; 01149 for (int j=i+1; j<=_layer_count-1; ++j) 01150 tmp_total+=(double)1/j; 01151 gsl_vector_set(_brokenStickDist,i, (double)tmp_total/(_layer_count-1)); 01152 } 01153 01154 //displayVector(_brokenStickDist, "broken stick distribution", true); 01155 01156 double myVariation; 01157 // now compare with the observed variation 01158 for (int i=0; i<_layer_count-1; ++i) 01159 { 01160 //myVariation=gsl_vector_get(_gsl_factor_weights_all_components,i); 01161 // rescale specificity ignoring marginality (first) factor 01162 myVariation=gsl_vector_get(_gsl_factor_weights_all_components,i+1)/(1-gsl_vector_get(_gsl_factor_weights_all_components,0)); 01163 //Log::instance()->info("Orig variation %6.2f, new variation %6.2f\n", gsl_vector_get(_gsl_factor_weights_all_components,i+1), myVariation); 01164 01165 if (myVariation <= gsl_vector_get(_brokenStickDist,i)) 01166 { 01167 if (i==0) // problem - no components selected 01168 { 01169 Log::instance()->warn( "First specialisation component explains less variation than the broken stick distribution - retaining all components by default\n" ); 01170 variationTotal=1; 01171 returnValue=_layer_count; 01172 break; 01173 } 01174 else // this component is discarded but keep earlier ones 01175 { 01176 returnValue=i+1; 01177 break; 01178 } 01179 } 01180 variationTotal+=myVariation; 01181 } 01182 gsl_vector_free(_brokenStickDist); 01183 } 01184 else 01185 { 01186 Log::instance()->warn( "Unknown problem whilst discarding components, retaining all components by default\n"); 01187 variationTotal=1; 01188 } 01189 Log::instance()->info( "Retained components: %i/%i, variation explained: %.2f\n", returnValue, _layer_count, variationTotal); 01190 return returnValue; 01191 } 01192 01193 01194 01196 bool Enfa::enfa1() 01197 { 01198 01199 //calculate the mean and std deviation for background points 01200 //Log::instance()->info( "Calculating mean and stddev for background points\n" ); 01201 _gsl_avg_background_vector = gsl_vector_alloc (_gsl_background_matrix->size2); 01202 _gsl_stddev_background_vector = gsl_vector_alloc (_gsl_background_matrix->size2) ; 01203 calculateMeanAndSd(_gsl_background_matrix,_gsl_avg_background_vector,_gsl_stddev_background_vector); 01204 01205 if (_verboseDebug) 01206 { 01207 displayVector(_gsl_avg_background_vector, "Background Means", true); 01208 displayVector(_gsl_stddev_background_vector, "Background Stdev", true); 01209 } 01210 01211 //calculate the mean and std deviation for centered presence points 01212 _gsl_avg_vector = gsl_vector_alloc (_gsl_environment_matrix->size2); 01213 _gsl_stddev_vector = gsl_vector_alloc (_gsl_environment_matrix->size2) ; 01214 calculateMeanAndSd(_gsl_environment_matrix,_gsl_avg_vector,_gsl_stddev_vector); 01215 01216 if (_verboseDebug) 01217 { 01218 displayVector(_gsl_avg_vector, "Species Means", true); 01219 displayVector(_gsl_stddev_vector, "Species Stdev", true); 01220 } 01221 01222 //center and standardise the data based on background means 01223 center(_gsl_environment_matrix,_localityCount); 01224 center(_gsl_background_matrix,_backgroundCount); 01225 01226 //displayMatrix(_gsl_background_matrix, "_gsl_background_matrix", true); 01227 01228 //calculate the mean and std deviation for centered presence points 01229 calculateMeanAndSd(_gsl_environment_matrix,_gsl_avg_vector,_gsl_stddev_vector); 01230 01231 if (_verboseDebug) 01232 { 01233 displayVector(_gsl_avg_vector, "Species Means - centered", true); 01234 displayVector(_gsl_stddev_vector, "Species Stdev - centered", true); 01235 } 01236 01237 //Now calculate the covariance matrix: 01238 //Log::instance()->info( "Calculating covariance matrix\n" ); 01239 _gsl_covariance_matrix = autoCovariance(_gsl_environment_matrix); 01240 if (_verboseDebug) 01241 displayMatrix(_gsl_covariance_matrix, "Species Covariance Matrix", true); 01242 01243 // now get covariance for background data 01244 _gsl_covariance_background_matrix = autoCovariance(_gsl_background_matrix); 01245 if (_verboseDebug) 01246 displayMatrix(_gsl_covariance_background_matrix, "Background Covariance Matrix", true); 01247 01248 // get root inverse of species covariance matrix 01249 _gsl_covariance_matrix_root_inverse = gsl_matrix_alloc(_gsl_covariance_matrix->size1,_gsl_covariance_matrix->size2); 01250 gsl_matrix* gsl_sqrtm_matrix = sqrtm(_gsl_covariance_matrix); 01251 _gsl_covariance_matrix_root_inverse = inverse(gsl_sqrtm_matrix); 01252 gsl_matrix_free(gsl_sqrtm_matrix); 01253 01254 //z = cov_species^-0.5 * m'; 01255 _gsl_workspace_z = gsl_vector_alloc (_layer_count); 01256 gsl_blas_dgemv (CblasNoTrans, 1.0, _gsl_covariance_matrix_root_inverse, _gsl_avg_vector, 0.0, _gsl_workspace_z); 01257 01258 if (_verboseDebug) 01259 displayVector(_gsl_workspace_z, "z", true); 01260 01261 //W = cov_species^-0.5 * cov_global * cov_species^-0.5; 01262 gsl_matrix* _gsl_workspace_W_temp = gsl_matrix_alloc(_layer_count, _layer_count); 01263 _gsl_workspace_W = gsl_matrix_alloc(_layer_count, _layer_count); 01264 gsl_blas_dgemm(CblasNoTrans,CblasNoTrans,1.0, 01265 _gsl_covariance_matrix_root_inverse, 01266 _gsl_covariance_background_matrix,0.0, 01267 _gsl_workspace_W_temp); 01268 01269 gsl_blas_dgemm(CblasNoTrans,CblasNoTrans,1.0, 01270 _gsl_workspace_W_temp, 01271 _gsl_covariance_matrix_root_inverse,0.0, 01272 _gsl_workspace_W); 01273 01274 gsl_matrix_free(_gsl_workspace_W_temp); 01275 01276 if (_verboseDebug) 01277 displayMatrix(_gsl_workspace_W, "W", true); 01278 01279 //y = z / sqrtm(z' * z); 01280 //require view of z as an Nx1 matrix rather than a vector 01281 gsl_matrix_view _gsl_workspace_z_matrix_view; 01282 _gsl_workspace_z_matrix_view = gsl_matrix_view_vector(_gsl_workspace_z, 01283 1,_layer_count); 01284 double zz; 01285 gsl_blas_ddot(_gsl_workspace_z, _gsl_workspace_z, &zz); 01286 01287 _gsl_workspace_y = gsl_matrix_alloc(1,_layer_count); 01288 gsl_matrix_memcpy(_gsl_workspace_y, &_gsl_workspace_z_matrix_view.matrix); 01289 gsl_matrix_scale(_gsl_workspace_y,pow(zz, -0.5)); 01290 01291 if (_verboseDebug) 01292 displayMatrix(_gsl_workspace_y, "y", true); 01293 01294 //eye(N)=identity matrix with dimension NxN (use gsl_matrix_set_identity()) 01295 //H = (eye(no_egvs) - y * y') * W * (eye(no_egvs) - y * y'); 01296 _gsl_workspace_H = gsl_matrix_alloc(_layer_count,_layer_count); 01297 gsl_matrix* _gsl_workspace_H_temp1 = gsl_matrix_alloc(_layer_count,_layer_count); 01298 gsl_matrix* _gsl_workspace_H_temp2 = gsl_matrix_alloc(_layer_count,_layer_count); 01299 gsl_matrix* _gsl_workspace_H_temp3 = gsl_matrix_alloc(_layer_count,_layer_count); 01300 01301 //initialise identity matrix 01302 gsl_matrix_set_identity(_gsl_workspace_H_temp1); 01303 01304 // first step work out (y * y') 01305 gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, 01306 _gsl_workspace_y, 01307 _gsl_workspace_y, 01308 0.0, _gsl_workspace_H_temp2); 01309 01310 // now subtract from identity 01311 gsl_matrix_sub(_gsl_workspace_H_temp1,_gsl_workspace_H_temp2); 01312 01313 // product of this with W 01314 gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, 01315 _gsl_workspace_H_temp1, 01316 _gsl_workspace_W, 01317 0.0, _gsl_workspace_H_temp3); 01318 01319 // finally product of this with temp1 01320 gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, 01321 _gsl_workspace_H_temp3, 01322 _gsl_workspace_H_temp1, 01323 0.0, _gsl_workspace_H); 01324 01325 gsl_matrix_free(_gsl_workspace_H_temp1); 01326 gsl_matrix_free(_gsl_workspace_H_temp2); 01327 gsl_matrix_free(_gsl_workspace_H_temp3); 01328 01329 if (_verboseDebug) 01330 displayMatrix(_gsl_workspace_H, "H", true); 01331 01332 //work out eigen-vectors/values of H 01333 _gsl_eigenvalue_vector = gsl_vector_alloc (_layer_count); 01334 gsl_vector * _gsl_eigenvalue_vector_copy = gsl_vector_alloc (_layer_count); 01335 _gsl_eigenvector_matrix = gsl_matrix_alloc (_layer_count, _layer_count); 01336 gsl_eigen_symmv_workspace * _gsl_eigen_workpace = gsl_eigen_symmv_alloc (_layer_count); 01337 gsl_eigen_symmv (_gsl_workspace_H, 01338 _gsl_eigenvalue_vector, 01339 _gsl_eigenvector_matrix, 01340 _gsl_eigen_workpace); 01341 01342 //free the temporary workspace 01343 gsl_eigen_symmv_free (_gsl_eigen_workpace); 01344 01345 //create score_matrix = cov_species^-0.5 * v; 01346 _gsl_score_matrix=gsl_matrix_alloc(_layer_count,_layer_count); 01347 gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, 01348 _gsl_covariance_matrix_root_inverse, 01349 _gsl_eigenvector_matrix, 01350 0.0, _gsl_score_matrix); 01351 01352 /* sort eigenvalues, eigenvectors and score matrix */ 01353 gsl_vector * _gsl_eigenvalue_vector_presort = gsl_vector_alloc(_layer_count); 01354 gsl_vector_memcpy(_gsl_eigenvalue_vector_presort, _gsl_eigenvalue_vector); 01355 gsl_eigen_symmv_sort (_gsl_eigenvalue_vector, _gsl_eigenvector_matrix, 01356 GSL_EIGEN_SORT_VAL_DESC); 01357 gsl_eigen_symmv_sort (_gsl_eigenvalue_vector_presort, _gsl_score_matrix, 01358 GSL_EIGEN_SORT_VAL_DESC); 01359 01360 //Log::instance()->info( "Eigenvector sorted\n" ); 01361 01362 /* Replace the min eigenvalue with the difference of the trace of W & H 01363 this becomes eigenvalue 1, the remaining eigenvalues move to index 2+ 01364 t = eigenvalues ~= min(eigenvalues); 01365 eigenvalues(2:no_egvs) = eigenvalues(t); 01366 eigenvalues(1) = trace(W) - trace(H)*/ 01367 01368 _gsl_vector_min = gsl_vector_min_index(_gsl_eigenvalue_vector); 01369 //Log::instance()->info( "min eigevalue: %i\n", _gsl_vector_min); 01370 //Log::instance()->info( "min eigevalue: %6.2f\n", gsl_vector_min(_gsl_eigenvalue_vector)); 01371 01372 gsl_vector_memcpy (_gsl_eigenvalue_vector_copy, 01373 _gsl_eigenvalue_vector); 01374 01375 // note starting at 1 as position 0 will be overwritten later 01376 for (int i=1; i<_layer_count; i++) 01377 { 01378 if (_gsl_vector_min>=i) 01379 { 01380 gsl_vector_set(_gsl_eigenvalue_vector,i, 01381 gsl_vector_get(_gsl_eigenvalue_vector_copy,i-1)); 01382 } 01383 } 01384 01385 //displayVector(_gsl_eigenvalue_vector, "_gsl_eigenvalue_vector:A", true); 01386 //displayVector(_gsl_eigenvalue_vector_copy, "_gsl_eigenvalue_vector_copy:A", true); 01387 01388 gsl_vector_free(_gsl_eigenvalue_vector_copy); 01389 01390 // now set first eigenvalue to trace(W) - trace(H) 01391 double traceW=0.0; 01392 for (int i=0; i<_layer_count; ++i) traceW+=gsl_matrix_get(_gsl_workspace_W, i, i); 01393 double traceH=0.0; 01394 for (int i=0; i<_layer_count; ++i) traceH+=gsl_matrix_get(_gsl_workspace_H, i, i); 01395 gsl_vector_set(_gsl_eigenvalue_vector,0,traceW-traceH); 01396 01397 /* % Figure out which eigenvalue has gone AWOL */ 01398 01399 /* remove column associated with dodgy eigenvector (_gsl_vector_min) */ 01400 /* Replace the first column with the vector of means */ 01401 /* and the move other columns down */ 01402 gsl_matrix * _gsl_score_matrix_copy=gsl_matrix_alloc(_layer_count,_layer_count); 01403 gsl_matrix_memcpy(_gsl_score_matrix_copy, _gsl_score_matrix); 01404 01405 for (int i=0; i<_layer_count; ++i) 01406 for (int j=0; j<_layer_count; ++j) 01407 { 01408 if (j==0) 01409 { 01410 gsl_matrix_set(_gsl_score_matrix,i,j,gsl_vector_get(_gsl_avg_vector,i)); 01411 } 01412 else if (_gsl_vector_min>=j) 01413 { 01414 gsl_matrix_set(_gsl_score_matrix,i,j, 01415 gsl_matrix_get(_gsl_score_matrix_copy,i,j-1)); 01416 } 01417 } 01418 01419 gsl_matrix_free(_gsl_score_matrix_copy); 01420 01421 double score2norm; 01422 /* Norm the columns of the score_matrix 01423 - that is divide each value by the 2-norm of its column */ 01424 for (int i=0; i<_layer_count; ++i) 01425 { 01426 // work out the 2-norm for this column = sqrt(sum(x^2)) 01427 double unorm=0.0; 01428 for (int j=0; j<_layer_count; ++j) 01429 unorm+=pow(gsl_matrix_get(_gsl_score_matrix,j,i),2); 01430 unorm=pow(unorm,0.5); 01431 01432 //Log::instance()->info( "col i: %i norm: %6.2f\n", i, unorm); 01433 01434 // and apply this normalisation to the matrix 01435 for (int j=0; j<_layer_count; ++j) 01436 { 01437 score2norm = gsl_matrix_get(_gsl_score_matrix,j,i); 01438 //Log::instance()->info( "score 2 norm: %6.2f\n", score2norm); 01439 score2norm = score2norm/unorm; 01440 //Log::instance()->info( "score 2 normed: %6.2f\n", score2norm); 01441 gsl_matrix_set(_gsl_score_matrix,j,i,score2norm); 01442 } 01443 } 01444 01445 if (_verboseDebug) 01446 { 01447 displayVector(_gsl_eigenvalue_vector, "Eigenvalues", true); 01448 displayMatrix(_gsl_eigenvector_matrix, "Eigenvectors", true); 01449 displayMatrix(_gsl_score_matrix, "Score Matrix", true); 01450 } 01451 01452 /* marginality = geometric mean of means 01453 marginality = sqrt(sum(m.^2)); 01454 marginality = marginality / 1.96 */ 01455 01456 _marginality=0.0; 01457 for (int i=0; i<_layer_count; ++i) 01458 _marginality+=pow(gsl_vector_get(_gsl_avg_vector,i),2); 01459 _marginality=pow(_marginality,0.5)/1.96; 01460 Log::instance()->info( "Marginality: %6.2f\n", _marginality ); 01461 01462 /* Calculate global specialization 01463 specialization = sqrt(sum(eigenvalues)/length(m)) */ 01464 _specialisation=pow((gsl_blas_dasum(_gsl_eigenvalue_vector)/_layer_count),0.5); 01465 Log::instance()->info( "Specialisation: %6.2f\n", _specialisation ); 01466 01467 01468 /* work out factor weights = eigenvalues/sum(eigenvalues) 01469 * dicarded components are given a zero weighting */ 01470 _gsl_factor_weights_all_components = gsl_vector_alloc(_layer_count); 01471 for (int i=0; i<_layer_count; ++i) 01472 { 01473 gsl_vector_set(_gsl_factor_weights_all_components,i, 01474 fabs(gsl_vector_get(_gsl_eigenvalue_vector,i))); 01475 } 01476 gsl_vector_scale(_gsl_factor_weights_all_components, 1.0/gsl_blas_dasum(_gsl_factor_weights_all_components)); 01477 01478 //After the model is generated, we can discard unwanted components! 01479 _retained_components_count=discardComponents(); 01480 01481 /* work out factor weights = eigenvalues/sum(eigenvalues) 01482 * dicarded components are given a zero weighting */ 01483 _gsl_factor_weights = gsl_vector_alloc(_layer_count); 01484 gsl_vector_set_zero(_gsl_factor_weights); 01485 for (int i=0; i<_retained_components_count; ++i) 01486 { 01487 gsl_vector_set(_gsl_factor_weights,i, 01488 fabs(gsl_vector_get(_gsl_eigenvalue_vector,i))); 01489 } 01490 gsl_vector_scale(_gsl_factor_weights, 1.0/gsl_blas_dasum(_gsl_factor_weights)); 01491 01492 // print factor weights 01493 for (int i=0; i<_retained_components_count; ++i) 01494 { 01495 if (i==0) 01496 { 01497 Log::instance()->info("Marginality (factor 0) weight: %.2f, variation explained: %.2f\n", gsl_vector_get(_gsl_factor_weights,i), gsl_vector_get(_gsl_factor_weights_all_components,i)); 01498 } 01499 else 01500 { 01501 Log::instance()->info("Specialisation factor %i weight: %.2f, variation explained: %.2f\n", i, gsl_vector_get(_gsl_factor_weights,i), gsl_vector_get(_gsl_factor_weights_all_components,i)); 01502 } 01503 } 01504 01505 /* Display the variable loadings of each factor (score matrix) */ 01506 if (_displayLoadings) 01507 { 01508 displayLoadings(_gsl_score_matrix, _retained_components_count); 01509 } 01510 01511 /* work out geometric mean of the distance between each species locality and 01512 every other locality based on the factored data */ 01513 _gsl_geomean_vector = gsl_vector_alloc(_localityCount); 01514 _gsl_environment_factor_matrix = gsl_matrix_alloc(_localityCount,_layer_count); 01515 gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 01516 1.0, _gsl_environment_matrix, _gsl_score_matrix, 01517 0.0, _gsl_environment_factor_matrix); 01518 01519 gsl_vector_view _gsl_geomean_workspace; 01520 01521 double _tmp_geomean; 01522 01523 //Log::instance()->info( "Calculating geometric means\n" ); 01524 //displayMatrix(_gsl_environment_factor_matrix, "_gsl_environment_factor_matrix", true); 01525 01526 // loop through the localities and calculate the geomean 01527 for (int i=0; i<_localityCount; ++i) 01528 { 01529 _gsl_geomean_workspace=gsl_matrix_row(_gsl_environment_factor_matrix,i); 01530 _tmp_geomean=getGeomean(&_gsl_geomean_workspace.vector); 01531 gsl_vector_set(_gsl_geomean_vector, i, _tmp_geomean); 01532 } 01533 01534 // finally sort the geomeans to speed up processing later 01535 gsl_sort_vector(_gsl_geomean_vector); 01536 01537 Log::instance()->info( "ENFA Model Generation Completed\n" ); 01538 01539 return true; 01540 01541 } 01542 01543 01544 01545 /****************************************************************/ 01546 /****************** configuration *******************************/ 01547 void 01548 Enfa::_getConfiguration( ConfigurationPtr& config ) const 01549 { 01550 if (!_done ) 01551 return; 01552 01553 ConfigurationPtr model_config( new ConfigurationImpl("Enfa") ); 01554 config->addSubsection( model_config ); 01555 01556 // _marginality 01557 model_config->addNameValue( "Marginality", _marginality ); 01558 01559 // _specialisation 01560 model_config->addNameValue( "Specialisation", _specialisation ); 01561 01562 // _retained_components_count 01563 model_config->addNameValue( "RetainedComponents", _retained_components_count ); 01564 01565 double *values = new double[_layer_count]; 01566 01567 // _gsl_avg_background_vector 01568 for (int i=0; i < _layer_count; ++i) 01569 values[i] = gsl_vector_get(_gsl_avg_background_vector, i); 01570 01571 model_config->addNameValue( "AvgBackgroundVector", values, _layer_count ); 01572 01573 // _gsl_stddev_background_vector 01574 for (int i=0; i < _layer_count; ++i) 01575 values[i] = gsl_vector_get(_gsl_stddev_background_vector, i); 01576 01577 model_config->addNameValue( "StddevBackgroundVector", values, _layer_count ); 01578 01579 // _gsl_avg_vector 01580 for (int i=0; i < _layer_count; ++i) 01581 values[i] = gsl_vector_get(_gsl_avg_vector, i); 01582 01583 model_config->addNameValue( "AvgVector", values, _layer_count ); 01584 01585 // _gsl_stddev_vector 01586 for (int i=0; i < _layer_count; ++i) 01587 values[i] = gsl_vector_get(_gsl_stddev_vector, i); 01588 01589 model_config->addNameValue( "StddevVector", values, _layer_count ); 01590 01591 // _gsl_eigenvalue_vector 01592 for (int i=0; i < _layer_count; ++i) 01593 values[i] = gsl_vector_get(_gsl_eigenvalue_vector, i); 01594 01595 model_config->addNameValue( "EigenvalueVector", values, _layer_count ); 01596 01597 // _gsl_factor_weights 01598 for (int i=0; i < _layer_count; ++i) 01599 values[i] = gsl_vector_get(_gsl_factor_weights, i); 01600 01601 model_config->addNameValue( "FactorWeights", values, _layer_count ); 01602 01603 delete[] values; 01604 01605 // _gsl_eigenvector_matrix 01606 int num_cells = _layer_count * _layer_count; 01607 01608 double *flat_eigenvector_matrix = new double[num_cells]; 01609 01610 int cnt = 0; 01611 01612 for (int i=0; i < _layer_count; ++i) 01613 for (int j=0; j < _layer_count; ++j, ++cnt) 01614 flat_eigenvector_matrix[cnt] = gsl_matrix_get( _gsl_eigenvector_matrix, i, j ); 01615 01616 model_config->addNameValue( "EigenvectorMatrix", flat_eigenvector_matrix, num_cells ); 01617 01618 delete[] flat_eigenvector_matrix; 01619 01620 // _gsl_score_matrix 01621 num_cells = _layer_count * _layer_count; 01622 01623 double *flat_score_matrix = new double[num_cells]; 01624 01625 cnt = 0; 01626 01627 for (int i=0; i < _layer_count; ++i) 01628 for (int j=0; j < _layer_count; ++j, ++cnt) 01629 flat_score_matrix[cnt] = gsl_matrix_get( _gsl_score_matrix, i, j ); 01630 01631 model_config->addNameValue( "ScoreMatrix", flat_score_matrix, num_cells ); 01632 01633 delete[] flat_score_matrix; 01634 01635 // _gsl_environment_factor_matrix 01636 num_cells = _localityCount * _layer_count; 01637 01638 double *flat_environment_factor_matrix = new double[num_cells]; 01639 01640 cnt = 0; 01641 01642 for (int i=0; i < _localityCount; ++i) 01643 for (int j=0; j < _layer_count; ++j, ++cnt) 01644 flat_environment_factor_matrix[cnt] = gsl_matrix_get( _gsl_environment_factor_matrix, i, j ); 01645 01646 model_config->addNameValue( "EnvironmentFactorMatrix", flat_environment_factor_matrix, num_cells ); 01647 01648 delete[] flat_environment_factor_matrix; 01649 01650 // _gsl_geomean_vector 01651 values = new double[_localityCount]; 01652 01653 for (int i=0; i < _localityCount; ++i) 01654 values[i] = gsl_vector_get(_gsl_geomean_vector, i); 01655 01656 model_config->addNameValue( "LocalityGeomeans", values, _localityCount ); 01657 01658 delete[] values; 01659 01660 } 01661 01662 void 01663 Enfa::_setConfiguration( const ConstConfigurationPtr& config ) 01664 { 01665 ConstConfigurationPtr model_config = config->getSubsection( "Enfa",false ); 01666 01667 if (!model_config) 01668 return; 01669 01670 // _retained_components_count 01671 _retained_components_count = model_config->getAttributeAsInt( "RetainedComponents", 0 ); 01672 01673 // _marginality 01674 _marginality = model_config->getAttributeAsDouble( "Marginality", 0.0 ); 01675 01676 // _specialisation 01677 _specialisation = model_config->getAttributeAsDouble( "Specialisation", 0.0 ); 01678 01679 // _gsl_avg_vector 01680 std::vector<double> stl_vector = model_config->getAttributeAsVecDouble( "AvgVector" ); 01681 01682 _layer_count = stl_vector.size(); 01683 01684 _gsl_avg_vector = gsl_vector_alloc( _layer_count ); 01685 01686 for (int i=0; i < _layer_count; ++i) 01687 gsl_vector_set( _gsl_avg_vector, i, stl_vector[i] ); 01688 01689 // _gsl_avg_background_vector 01690 stl_vector = model_config->getAttributeAsVecDouble( "AvgBackgroundVector" ); 01691 01692 _layer_count = stl_vector.size(); 01693 01694 _gsl_avg_background_vector = gsl_vector_alloc( _layer_count ); 01695 01696 for (int i=0; i < _layer_count; ++i) 01697 gsl_vector_set( _gsl_avg_background_vector, i, stl_vector[i] ); 01698 01699 // _gsl_stddev_vector 01700 stl_vector = model_config->getAttributeAsVecDouble( "StddevVector" ); 01701 01702 _gsl_stddev_vector = gsl_vector_alloc( _layer_count ); 01703 01704 for (int i=0; i < _layer_count; ++i) 01705 gsl_vector_set( _gsl_stddev_vector, i, stl_vector[i] ); 01706 01707 // _gsl_stddev_background_vector 01708 stl_vector = model_config->getAttributeAsVecDouble( "StddevBackgroundVector" ); 01709 01710 _gsl_stddev_background_vector = gsl_vector_alloc( _layer_count ); 01711 01712 for (int i=0; i < _layer_count; ++i) 01713 gsl_vector_set( _gsl_stddev_background_vector, i, stl_vector[i] ); 01714 01715 // _gsl_eigenvalue_vector 01716 stl_vector = model_config->getAttributeAsVecDouble( "EigenvalueVector" ); 01717 01718 _layer_count = stl_vector.size(); 01719 01720 _gsl_eigenvalue_vector = gsl_vector_alloc( _layer_count ); 01721 01722 for (int i=0; i < _layer_count; ++i) 01723 gsl_vector_set( _gsl_eigenvalue_vector, i, stl_vector[i] ); 01724 01725 // _gsl_factor_weights 01726 stl_vector = model_config->getAttributeAsVecDouble( "FactorWeights" ); 01727 01728 _layer_count = stl_vector.size(); 01729 01730 _gsl_factor_weights = gsl_vector_alloc( _layer_count ); 01731 01732 for (int i=0; i < _layer_count; ++i) 01733 gsl_vector_set( _gsl_factor_weights, i, stl_vector[i] ); 01734 01735 // _gsl_eigenvector_matrix 01736 stl_vector = model_config->getAttributeAsVecDouble( "EigenvectorMatrix" ); 01737 01738 _gsl_eigenvector_matrix = gsl_matrix_alloc( _layer_count, _layer_count ); 01739 01740 int cnt = 0; 01741 01742 for (int i=0; i < _layer_count; ++i) 01743 for (int j=0; j < _layer_count; ++j, ++cnt) 01744 gsl_matrix_set( _gsl_eigenvector_matrix, i, j, stl_vector[cnt] ); 01745 01746 01747 // _gsl_score_matrix 01748 stl_vector = model_config->getAttributeAsVecDouble( "ScoreMatrix" ); 01749 01750 _gsl_score_matrix = gsl_matrix_alloc( _layer_count, _layer_count ); 01751 01752 cnt = 0; 01753 01754 for (int i=0; i < _layer_count; ++i) 01755 for (int j=0; j < _layer_count; ++j, ++cnt) 01756 gsl_matrix_set( _gsl_score_matrix, i, j, stl_vector[cnt] ); 01757 01758 01759 // _gsl_environment_factor_matrix 01760 stl_vector = model_config->getAttributeAsVecDouble( "EnvironmentFactorMatrix" ); 01761 01762 _localityCount = stl_vector.size()/_layer_count; 01763 01764 _gsl_environment_factor_matrix = gsl_matrix_alloc( _localityCount, _layer_count ); 01765 01766 cnt = 0; 01767 01768 for (int i=0; i < _localityCount; ++i) 01769 for (int j=0; j < _layer_count; ++j, ++cnt) 01770 gsl_matrix_set( _gsl_environment_factor_matrix, i, j, stl_vector[cnt] ); 01771 01772 01773 // _gsl_geomean_vector 01774 stl_vector = model_config->getAttributeAsVecDouble( "LocalityGeomeans" ); 01775 01776 _gsl_geomean_vector = gsl_vector_alloc( _localityCount ); 01777 01778 for (int i=0; i < _localityCount; ++i) 01779 gsl_vector_set( _gsl_geomean_vector, i, stl_vector[i] ); 01780 01781 _done = true; 01782 01783 }