openModeller  Version 1.5.0
enfa.cpp
Go to the documentation of this file.
1 //
2 // Enfa
3 //
4 // Description:
5 //
6 //
7 // Author: CRIA <chris.yesson@ioz.ac.uk>, (C) 2009
8 //
9 // Copyright: See COPYING file that comes with this distribution
10 //
11 //
13 
14 #include <string.h>
15 #include <cassert>
16 #include <math.h>
17 #include <gsl/gsl_statistics.h>
18 #include <gsl/gsl_math.h>
19 #include <gsl/gsl_permutation.h>
20 #include <gsl/gsl_matrix.h>
21 #include <gsl/gsl_eigen.h>
22 #include <gsl/gsl_blas.h>
23 #include <gsl/gsl_linalg.h>
24 #include <gsl/gsl_errno.h>
25 #include <gsl/gsl_sort_vector.h>
26 
27 #include "enfa.hh"
28 
29 
30 /****************************************************************/
31 /********************** Algorithm's Metadata ********************/
32 
33 #define NUM_PARAM 7
34 #define BACKGROUND_ID "NumberOfBackgroundPoints"
35 #define NUM_RETRIES "NumberOfRetries"
36 #define DISCARD_METHOD "DiscardMethod"
37 #define RETAIN_COMPONENTS "RetainComponents"
38 #define RETAIN_VARIATION "RetainVariation"
39 #define DISPLAY_LOADINGS "DisplayLoadings"
40 #define VERBOSE_DEBUG "VerboseDebug"
41 
43 
44  {
45  BACKGROUND_ID, // Id.
46  "Number of background sample points", // Name.
47  Integer, // Type.
48  "Number of background points to be sampled when estimating the mean and standard deviation.", // Overview
49  "The ENFA algorithm compares the species presence data with the background environment. This requires the calculation of the mean, standard deviation and covariances of each environmental layer. This could be prohibitively slow and expensive for large datasets, so estimate these values by randomly sampling X points from the background data.", // Description.
50  1, // Not zero if the parameter has lower limit.
51  10, // Parameter's lower limit.
52  0, // Not zero if the parameter has upper limit.
53  0, // Parameter's upper limit.
54  "10000" // Parameter's typical (default) value.
55  },
56  {
57  NUM_RETRIES, // Id.
58  "Number of retries of model", // Name.
59  Integer, // Type.
60  "Number of attempted retries in the case that the model generation fails.", // Overview
61  "The algorithm requires the inversion of several matrices, but sometimes the matrix to invert is singular and so the algorithm fails. This seems to occur when the background data is undersampled or not representative. Often retrying the model generation (i.e. resampling the background data) makes this problem go away.", // Description.
62  1, // Not zero if the parameter has lower limit.
63  1, // Parameter's lower limit.
64  0, // Not zero if the parameter has upper limit.
65  100, // Parameter's upper limit.
66  "5" // Parameter's typical (default) value.
67  },
68  {
69  DISCARD_METHOD, // Id.
70  "Method for discarding components", // Name.
71  Integer, // Type.
72  "Method for discarding components: 0 - Fixed number (set by RETAIN_COMPONENTS), 1 - Minimum components explaining a fixed variation (set by RETAIN_VARIATION), 2 - Broken stick method", // Overview
73  " 0 - Retain a fixed number of components defined by the variable RETAIN_COMPONENTS\n 1 - Retain the top N components that cumulatively explain the level of variation defined by the RETAIN_VARIATION variable\n 2 - Compare the observed explanation of variation to the broken stick distribution retaining those components explaining higher levels of variation", // Description.
74  1, // Not zero if the parameter has lower limit.
75  0, // Parameter's lower limit.
76  1, // Not zero if the parameter has upper limit.
77  2, // Parameter's upper limit.
78  "2" // Parameter's typical (default) value.
79  },
80  {
81  RETAIN_COMPONENTS, // Id.
82  "Number of components to retain", // Name.
83  Integer, // Type.
84  "Specify the number of components to retain (only for DISCARD_METHOD=0)", // Overview
85  "If the Discard_method=0, then this variable is used to determine the number of components to retain.", // Description.
86  1, // Not zero if the parameter has lower limit.
87  1, // Parameter's lower limit.
88  0, // Not zero if the parameter has upper limit.
89  100, // Parameter's upper limit.
90  "2" // Parameter's typical (default) value.
91  },
92  {
93  RETAIN_VARIATION, // Id.
94  "Percent varition for component retention", // Name.
95  Integer, // Type.
96  "Specify the amount of variation that the retained components should explain (only for DISCARD_METHOD=1)", // Overview
97  "If the Discard_method=1, then this variable is used to determine the number of components to retain, by taking those components that cumulatively account for at least this much variation.", // Description.
98  1, // Not zero if the parameter has lower limit.
99  0.5, // Parameter's lower limit.
100  1, // Not zero if the parameter has upper limit.
101  1.0, // Parameter's upper limit.
102  "0.75" // Parameter's typical (default) value.
103  },
104  {
105  DISPLAY_LOADINGS, // Id.
106  "Display variable loadings for each factor", // Name.
107  Integer, // Type.
108  "Output a comma separated matrix of variable loadings for each retained factor", // Overview
109  "Set to 1 to display the matrix. Var=variable, Marg=Marginality (factor 0), Sp-1=Specialisation factor 1, etc. Variables are numbered in the order they are input in the request file.", // Description.
110  1, // Not zero if the parameter has lower limit.
111  0, // Parameter's lower limit.
112  1, // Not zero if the parameter has upper limit.
113  1, // Parameter's upper limit.
114  "0" // Parameter's typical (default) value.
115  },
116  {
117  VERBOSE_DEBUG, // Id.
118  "Verbose printing for debugging", // Name.
119  Integer, // Type.
120  "Print lots of details", // Overview
121  "", // Description.
122  1, // Not zero if the parameter has lower limit.
123  0, // Parameter's lower limit.
124  1, // Not zero if the parameter has upper limit.
125  1, // Parameter's upper limit.
126  "0" // Parameter's typical (default) value.
127  },
128 };
129 
130 /****************************************************************/
131 /****************************** Enfa *****************************/
132 static AlgMetadata metadata = { // General metadata
133  "ENFA", // Id
134  "ENFA (Ecological-Niche Factor Analysis)", // Name
135  "0.1.3", // Version
136  "Algorithm based on presence only data using a modified principal components analysis.", // Overview
137  "Ecological-Niche Factor Analysis (Hirzel et al, 2002) uses a modified principal components analysis to develop a model based on presence only data. The observed environment is compared to the background data of the study area (note that absence points in the occurrence file are treated as background data). The analysis produces factors similar to a PCA. The first factor is termed the 'marginality' of the species, marginality is defined as the ecological distance between the species optimum and the mean habitat within the background data. Other factors are termed the 'specialization', and are defined as the ratio of the ecological variance in mean habitat to that observed for the target species. Model projection uses the geomeans method of Hirzel & Arlettaz (2003)", // Description
138  "Hirzel, A.H.; Hausser, J.; Chessel, D. & Perrin, N.", // Algorithm author
139  "Hirzel, A.H.; Hausser, J.; Chessel, D. & Perrin, N. Ecological-niche factor analysis: How to compute habitat-suitability maps without absence data? Ecology, 2002, 83, 2027-2036\nHirzel, A.H & Arlettaz, R. Modeling habitat suitability for complex species distributions by environmental distance geometric mean Environmental Management, 2003, 32, 614-623\n", // Bibliography
140  "Chris Yesson", // Code author
141  "chris.yesson [at] ioz.ac.uk", // Code author's contact
142  0, // Does not accept categorical data
143  0, // Does not need (pseudo)absence points
144  NUM_PARAM, parameters // Algorithm's parameters
145 };
146 
147 
148 
149 /****************************************************************/
150 /****************** Algorithm's factory function ****************/
151 
152 OM_ALG_DLL_EXPORT
155 {
156  return new Enfa();
157 }
158 
159 OM_ALG_DLL_EXPORT
160 AlgMetadata const *
162 {
163  return &metadata;
164 }
165 
166 
171 Enfa::Enfa() :
172  AlgorithmImpl( &metadata )
173 {
174  _initialized = 0;
175  _gsl_environment_matrix = 0;
176  _gsl_environment_factor_matrix = 0;
177  _gsl_background_matrix = 0;
178  _gsl_covariance_matrix = 0;
179  _gsl_covariance_background_matrix = 0;
180  _gsl_avg_vector = 0;
181  _gsl_stddev_vector = 0;
182  _gsl_avg_background_vector = 0;
183  _gsl_stddev_background_vector = 0;
184  _gsl_eigenvalue_vector = 0;
185  _gsl_eigenvector_matrix = 0;
186  _gsl_score_matrix = 0;
187  _gsl_covariance_matrix_root_inverse = 0;
188  _gsl_workspace_H = 0;
189  _gsl_workspace_W = 0;
190  _gsl_workspace_y = 0;
191  _gsl_workspace_z = 0;
192  _gsl_factor_weights_all_components = 0;
193  _gsl_factor_weights = 0;
194  _gsl_geomean_vector = 0;
195 }
196 
197 
199 Enfa::~Enfa()
200 {
201  if ( _initialized )
202  {
203  if (_gsl_background_matrix)
204  {
205  gsl_matrix_free (_gsl_background_matrix);
206  }
207  if (_gsl_covariance_background_matrix)
208  {
209  gsl_matrix_free (_gsl_covariance_background_matrix);
210  }
211  if (_gsl_covariance_matrix)
212  {
213  gsl_matrix_free (_gsl_covariance_matrix);
214  }
215  if (_gsl_covariance_matrix_root_inverse)
216  {
217  gsl_matrix_free (_gsl_covariance_matrix_root_inverse);
218  }
219  if (_gsl_eigenvector_matrix)
220  {
221  gsl_matrix_free (_gsl_eigenvector_matrix);
222  }
223  if (_gsl_environment_factor_matrix)
224  {
225  gsl_matrix_free (_gsl_environment_factor_matrix);
226  }
227  if (_gsl_environment_matrix)
228  {
229  gsl_matrix_free (_gsl_environment_matrix);
230  }
231  if (_gsl_score_matrix)
232  {
233  gsl_matrix_free (_gsl_score_matrix);
234  }
235  if (_gsl_workspace_H)
236  {
237  gsl_matrix_free (_gsl_workspace_H);
238  }
239  if (_gsl_workspace_W)
240  {
241  gsl_matrix_free (_gsl_workspace_W);
242  }
243  if (_gsl_workspace_y)
244  {
245  gsl_matrix_free (_gsl_workspace_y);
246  }
247  if (_gsl_avg_vector)
248  {
249  gsl_vector_free (_gsl_avg_vector);
250  }
251  if (_gsl_avg_background_vector)
252  {
253  gsl_vector_free (_gsl_avg_background_vector);
254  }
255  if (_gsl_eigenvalue_vector)
256  {
257  gsl_vector_free (_gsl_eigenvalue_vector);
258  }
259  if (_gsl_stddev_vector)
260  {
261  gsl_vector_free (_gsl_stddev_vector);
262  }
263  if (_gsl_stddev_background_vector)
264  {
265  gsl_vector_free (_gsl_stddev_background_vector);
266  }
267  if (_gsl_workspace_z)
268  {
269  gsl_vector_free (_gsl_workspace_z);
270  }
271  if (_gsl_factor_weights)
272  {
273  gsl_vector_free (_gsl_factor_weights);
274  }
275  if (_gsl_factor_weights_all_components)
276  {
277  gsl_vector_free (_gsl_factor_weights_all_components);
278  }
279  if (_gsl_geomean_vector)
280  {
281  gsl_vector_free (_gsl_geomean_vector);
282  }
283  }
284 }
285 
286 
287 //
288 // Methods used to build the model
289 //
290 
291 
292 
293 
301 int Enfa::initialize()
302 {
303  _initialized = 1;
304 
305  Log::instance()->info( "Base ENFA class initializing\n" );
306 
307  //set the class member that holds the number of environmental variables
308  //we subtract 1 because the first column contains the specimen count
309  _layer_count = _samp->numIndependent();
310  //set the class member that holds the number of occurences
311  _localityCount = _samp->numPresence();
312  //Log::instance()->info( "Checking more than 1 samples exist\n" );
313  if (_localityCount<2)
314  {
315  Log::instance()->warn( "ENFA needs at least two occurrence points..aborting...\n" );
316  _initialized = 0;
317  return 0;
318  }
319 
320  // _backgroundCount = _backgroundSamp->numOccurrences();
321  //if ( _samp->numAbsence() )
322  if ( _samp->numAbsence() )
323  {
324  _backgroundProvided=1;
325  _backgroundCount = _samp->numAbsence();
326  Log::instance()->info( "Using background data provided (%i records marked as absences in the occurrence file)\n",_backgroundCount);
327 
328  }
329  else
330  {
331  getParameter( BACKGROUND_ID, &_backgroundCount );
332  _backgroundProvided=0;
333  }
334 
335  //use the pseudoabsence generator to do this
336  getParameter( NUM_RETRIES, &_numRetries );
337  getParameter( DISCARD_METHOD, &_discardMethod);
338  getParameter( RETAIN_COMPONENTS, &_retainComponents);
339  getParameter( RETAIN_VARIATION, &_retainVariation);
340  getParameter( DISPLAY_LOADINGS, &_displayLoadings);
341  getParameter( VERBOSE_DEBUG, &_verboseDebug);
342 
343  /* sometimes our random background samples produce singular matrices
344  so catch this exception and restart the model */
345 
346  _retryCount=0;
347  bool myFlag=false;
348 
349  while (_retryCount<_numRetries && !myFlag)
350  {
351 
352  _retryCount+=1;
353  try
354  {
355 
356  //convert the sampler to a matrix and store in the local gsl_matrix
357  //Log::instance()->info( "Converting samples to GSL_Matrix\n" );
358  if (!SamplerToMatrix())
359  {
360  Log::instance()->warn( "All occurences are outside the masked area!\n" );
361  _initialized = 0;
362  return 0;
363  }
364 
365  if (!BackgroundToMatrix())
366  {
367  Log::instance()->warn( "Failed to sample background data!\n" );
368  _initialized = 0;
369  return 0;
370  }
371 
372  myFlag = enfa1();
373 
374  }
375  catch (InverseFailedException& exception)
376  {
377  UNUSED(exception);
378  Log::instance()->warn( "Model failed, retry number %i\n", _retryCount );
379  myFlag=false;
380  }
381 
382  }
383 
384  if (! myFlag)
385  {
386  Log::instance()->warn( "Model initialization failed!\n" );
387  }
388 
389  return myFlag;
390 }
391 
396 int Enfa::SamplerToMatrix()
397 {
398  if (_localityCount < 1)
399  {
400  return 0;
401  }
402 
403  OccurrencesImpl::const_iterator pit = _samp->getPresences()->begin();
404  OccurrencesImpl::const_iterator fin = _samp->getPresences()->end();
405 
406  // Allocate the gsl matrix to store environment data at each locality
407  _gsl_environment_matrix = gsl_matrix_alloc (_localityCount, _layer_count);
408 
409  // now populate the gsl matrix from the sample data
410  for (int i=0; pit != fin; ++pit, ++i)
411  {
412  for (int j=0;j<_layer_count;j++)
413  {
414  //we add one to j in order to omit the specimen count column
415  double myCellValue = (double)(*pit)->environment()[j];
416  gsl_matrix_set (_gsl_environment_matrix,i,j,myCellValue);
417  }
418  }
419 
420  //Log::instance()->info( "Enfa::SampleToMatrix: x: %i y: %i\n",_layer_count,_localityCount );
421  return 1;
422 }
423 
424 // sample background data and store it in a gsl_matrix
425 int Enfa::BackgroundToMatrix()
426 {
427 
428  //Log::instance()->info("Enfa:BackgroundToMatrix:Generating background samples.\n" );
429 
430  // try getting all background samples at once using the absence generator
431  // this enables us to ensure geographic uniqueness
432  // question: what happens when num-background > number of cells in env layer?
433  // allow user to provide background points using the absence parser
434  OccurrencesPtr _ocbg;
437 
438  if (_backgroundProvided==1)
439  {
440  _ocbg = _samp->getAbsences();
441  }
442  else
443  {
444  _ocbg = _samp->getPseudoAbsences(_backgroundCount, 0, 1, false, false);
445  }
446 
447  pit = _ocbg->begin();
448  fin = _ocbg->end();
449 
450  // Allocate the gsl matrix to store environment data at each background point
451  _gsl_background_matrix = gsl_matrix_alloc (_backgroundCount, _layer_count);
452  // now populate the gsl matrix from the sample data
453  for (int i=0; pit != fin; ++pit, ++i)
454  {
455  for (int j=0;j<_layer_count;j++)
456  {
457  //we add one to j in order to omit the specimen count column
458  double myCellValue = (double)(*pit)->environment()[j];
459  gsl_matrix_set (_gsl_background_matrix,i,j,myCellValue);
460  }
461  }
462 
463  return 1;
464 }
465 
466 
467 
469 int Enfa::calculateMeanAndSd(gsl_matrix * theMatrix,
470  gsl_vector * theMeanVector,
471  gsl_vector * theStdDevVector)
472 {
473 #ifndef WIN32
474  assert (theMatrix != 0);
475  assert (theMeanVector !=0);
476  assert (theStdDevVector !=0);
477 #endif
478  //Initialise the vector to hold the mean of each column
479  gsl_vector_set_zero(theMeanVector);
480 
481  //Initialise the vector to hold the stddev of each column
482  gsl_vector_set_zero(theStdDevVector);
483 
484  //calculate the mean and stddev of each column
485  for (int j = 0; j < _layer_count; j++)
486  {
487  //get the current column from the array as a vector
488  gsl_vector_view myColumn = gsl_matrix_column (theMatrix, j);
489  //calculate the average for the column ...
490  double myAverage = gsl_stats_mean (myColumn.vector.data, myColumn.vector.stride, myColumn.vector.size);
491  // ...and assign it to the jth element in the column means vector
492  gsl_vector_set (theMeanVector,j,myAverage);
493  //calculate the stddev for the column and ...
494  double myStdDev = gsl_stats_sd (myColumn.vector.data, myColumn.vector.stride, myColumn.vector.size);
495  // ...and assign it to the jth element in the column stddev vector
496  gsl_vector_set (theStdDevVector,j,myStdDev);
497  }
498 
499  return 0;
500 }
501 
502 int Enfa::center(gsl_matrix * theMatrix,
503  int spCount)
504 {
505 #ifndef WIN32
506  assert (theMatrix != 0);
507 #endif
508  //
509  //Subtract the column mean from every value in each column
510  //Divide each resultant column value by the stddev for that column
511  //Note that we are walking the matrix column wise
512  //
513  //Log::instance()->info( "Centering data\n" );
514  //for (int j=0;j<_layer_count;j++)
515  int msize=theMatrix->size1;
516  for (int j=0;j<_layer_count;j++)
517  {
518  //get the stddev and mean for this column
519  double myAverage = gsl_vector_get (_gsl_avg_background_vector,j);
520  double myStdDev = gsl_vector_get (_gsl_stddev_background_vector,j);
521 
522  for (int i=0;i<msize;++i)
523  {
524  double myDouble = gsl_matrix_get (theMatrix,i,j);
525  if (myStdDev > 0)
526  {
527  myDouble = (myDouble-myAverage)/myStdDev;
528  }
529  else
530  {
531  myDouble=0.0;
532  }
533  //update the gsl_matrix with the new value
534  gsl_matrix_set(theMatrix,i,j,myDouble);
535  }
536  }
537 
538  return 0;
539 }
540 
541 
546 int Enfa::iterate()
547 {
548  _done=1;
549  return 1;
550 }
551 
552 
559 int Enfa::done() const
560 {
561  return _done;
562 }
563 
564 
565 //
566 // Methods used to project the model
567 //
568 
569 
574 Scalar Enfa::getValue( const Sample& x ) const
575 {
576  int ii, cellcount;
577  float myFloat;
578 
579  //Log::instance()->info( "Enfa:getValue:entering getvalue\n" );
580 
581  //first thing we do is convert the oM primitive env value array
582  // to a gsl vector so we can use matrix multplication with it
583  gsl_vector * tmp_raw_gsl_vector = gsl_vector_alloc (_layer_count);
584  // center data using the avg and stdev
585  gsl_vector * tmp_centered_gsl_vector = gsl_vector_alloc(_layer_count);
586  double m1, m2, m3, m4;
587 
588  for (ii=0;ii<_layer_count;++ii)
589  {
590  myFloat = static_cast<float>(x[ii]);
591  gsl_vector_set (tmp_raw_gsl_vector,ii,myFloat);
592 
593  m1=myFloat;
594  m2=gsl_vector_get(_gsl_avg_background_vector,ii);
595  m3=gsl_vector_get(_gsl_stddev_background_vector,ii);
596  m4=(m1-m2)/m3;
597  //Log::instance()->info( "getValue: m1: %6.2f, m2: %6.2f, m3: %6.2f, m4: %6.2f, \n", m1, m2, m3, m4 );
598  gsl_vector_set(tmp_centered_gsl_vector, ii, m4);
599  }
600 
601  //if (_verboseDebug)
602  //{
603  //displayVector(tmp_raw_gsl_vector, "getvalue: tmp_raw_gsl_vector", true);
604  //displayVector(tmp_centered_gsl_vector, "getvalue: tmp_centered_gsl_vector", true);
605  //}
606 
607  gsl_vector_free(tmp_raw_gsl_vector);
608 
609  //vector for factored data
610  gsl_matrix * tmp_centered_gsl_matrix = gsl_matrix_alloc(1,_layer_count);
611  for (ii=0;ii<_layer_count;++ii)
612  gsl_matrix_set(tmp_centered_gsl_matrix,0,ii,gsl_vector_get(tmp_centered_gsl_vector,ii));
613 
614  gsl_vector_free(tmp_centered_gsl_vector);
615 
616  gsl_matrix * tmp_factored_gsl_matrix = gsl_matrix_alloc(1,_layer_count);
617 
618  // multiply centered data by score matrix to get factored data
619  //gsl_blas_dgemv (CblasTrans, 1.0, _gsl_score_matrix, tmp_centered_gsl_vector, 0.0, tmp_factored_gsl_vector);
620  //gsl_blas_dgemv (CblasNoTrans, 1.0, _gsl_score_matrix, tmp_centered_gsl_vector, 0.0, tmp_factored_gsl_vector);
621 
622  gsl_blas_dgemm(CblasNoTrans, CblasNoTrans,
623  1.0, tmp_centered_gsl_matrix, _gsl_score_matrix,
624  0.0, tmp_factored_gsl_matrix);
625 
626  gsl_vector* tmp_factored_gsl_vector = gsl_vector_alloc(_layer_count);
627  for (ii=0;ii<_layer_count;++ii)
628  gsl_vector_set(tmp_factored_gsl_vector,ii,
629  gsl_matrix_get(tmp_factored_gsl_matrix,0,ii));
630 
631  //if (_verboseDebug)
632  //displayVector(tmp_factored_gsl_vector, "getvalue: tmp_factored_gsl_vector", true);
633 
634  // work out geomean from this point to all specimen localites
635  double tmp_geomean = getGeomean(tmp_factored_gsl_vector);
636 
637  gsl_matrix_free(tmp_centered_gsl_matrix);
638  gsl_matrix_free(tmp_factored_gsl_matrix);
639  gsl_vector_free(tmp_factored_gsl_vector);
640 
641  //Log::instance()->info( "getValue: geomean %6.2f\n", tmp_geomean );
642 
643 
644  /* % OK, now to convert these into habitat suitability values.
645  % Determine what percentage of the species presence points
646  % are further away from zero than this point */
647 
648  //cellcount=0;
649 
650  //old version checking every value on unsorted vector
651  //for (ii=0; ii<_localityCount; ++ii)
652  // if (gsl_vector_get(_gsl_geomean_vector,ii)>=tmp_geomean) cellcount+=1;
653 
654  // step up the sorted geomeans vector stop when current geomean is higher
655  // quickest when most cells are of low suitability
656  cellcount=_localityCount;
657  for (ii=_localityCount-1; ii>=0; --ii)
658  if (gsl_vector_get(_gsl_geomean_vector,ii)<=tmp_geomean)
659  {
660  cellcount=_localityCount-ii-1;
661  break;
662  }
663 
664  //Log::instance()->info( "getValue: cellcount %i\n", cellcount );
665  Scalar myReturn=(Scalar)cellcount/_localityCount;
666 
667  //Log::instance()->info( "enfa::getValue %6.2f\n", myReturn );
668 
669  return myReturn;
670 }
671 
678 int Enfa::getConvergence( Scalar * const val ) const
679 {
680  return 0;
681 }
682 
683 //
684 // General Helper Methods
685 //
686 
687 
688 void Enfa::displayVector(const gsl_vector * v, const char * name, const bool roundFlag) const
689 {
690  if (roundFlag)
691  {
692 
693  fprintf( stderr, "\nDisplaying Vector rounded to 4 decimal places '%s' (%u): \n----------------------------------------------\n[ ", name, static_cast<unsigned int>(v->size) );
694  }
695  else
696  {
697  fprintf( stderr, "\nDisplaying Vector '%s' (%u): \n----------------------------------------------\n[ ", name, static_cast<unsigned int>(v->size) );
698  }
699 
700  char sep1[] = ", ";
701 
702  for (unsigned int i=0;i<v->size;++i)
703  {
704  if (i == v->size -1)
705  strcpy(sep1, " ]");
706 
707  double myDouble = gsl_vector_get (v,i);
708  if (roundFlag)
709  {
710  fprintf( stderr, "%.4g %s", myDouble, sep1 );
711  }
712  else
713  {
714  fprintf( stderr, "%g %s", myDouble, sep1 );
715  }
716  }
717  fprintf( stderr, "\n----------------------------------------------\n" );
718 }
719 
720 
721 /**********************/
722 /**** displayMatrix ***/
723 void Enfa::displayMatrix(const gsl_matrix * m, const char * name, const bool roundFlag) const
724 {
725  if (!roundFlag)
726  {
727  fprintf( stderr, "\nDisplaying Matrix '%s' (%u / %u): \n----------------------------------------------\n[\n", name, static_cast<unsigned int>(m->size1), static_cast<unsigned int>(m->size2) );
728  }
729  else
730  {
731  fprintf( stderr, "\nDisplaying Matrix rounded to 6 decimal places '%s' (%u / %u): \n----------------------------------------------\n[\n", name, static_cast<unsigned int>(m->size1), static_cast<unsigned int>(m->size2) );
732  }
733  for (unsigned int i=0;i<m->size1;++i)
734  {
735  char sep1[] = ",";
736  char sep2[] = ";";
737 
738  for (unsigned int j=0;j<m->size2;j++)
739  {
740  double myDouble = gsl_matrix_get (m,i,j);
741 
742  if (j == m->size2 -1)
743  strcpy(sep1, "");
744  if (!roundFlag)
745  {
746  fprintf( stderr, "%g %s ", myDouble, sep1 );
747  }
748  else
749  {
750  fprintf( stderr, "%.6g %s ", myDouble, sep1 );
751  }
752  }
753 
754  fprintf( stderr, "%s\n", sep2 );
755  }
756  fprintf( stderr, "]\n----------------------------------------------\n" );
757 }
758 
759 /**********************/
760 /**** displayMatrix ***/
761 void Enfa::displayLoadings(const gsl_matrix * m, const int f) const
762 {
763  Log::instance()->info( "Factor Loadings:\n");
764 
765  // generate header
766  Log::instance()->info( "Var,Marg");
767  //fprintf( stderr, "Var,Marg");
768 
769  for (signed int j=1;j<f;j++)
770  {
771  fprintf( stderr, ",Sp-%i", j);
772  }
773 
774  fprintf( stderr, "\n");
775 
776  for (unsigned int i=0;i<m->size1;++i)
777  {
778  Log::instance()->info( "%i",i);
779  //fprintf( stderr, "%i",i);
780  for (signed int j=0;j<f;j++)
781  {
782  double myDouble = gsl_matrix_get (m,i,j);
783 
784  fprintf( stderr, ",%3.3f", myDouble);
785  }
786  fprintf( stderr, "\n");
787  }
788 }
789 
790 /*****************************************************************
791  function to find the 'square-root' of a matrix:
792  the input matrix must be positive semi-definite symmetric matrix
793  note that covariance matrices have these properties */
794 gsl_matrix* Enfa::sqrtm(gsl_matrix* original_matrix) const
795 {
796  int m_size, i, j;
797  double lambda, v;
798 
799  /* make a copy of the input matrix */
800  // Build a copy of the input matrix to work with
801  m_size = original_matrix->size1;
802 
803  gsl_matrix* m = gsl_matrix_alloc (m_size, m_size);
804  gsl_matrix_memcpy (m, original_matrix);
805 
806  //create temporary space for eigenvalues, eigenvectors & workspace
807  gsl_vector* eigval_v = gsl_vector_alloc (m_size);
808  gsl_matrix* eigvect_m = gsl_matrix_alloc (m_size, m_size);
809  gsl_eigen_symmv_workspace * temp_v = gsl_eigen_symmv_alloc (m_size);
810 
811  gsl_eigen_symmv (m,
812  eigval_v,
813  eigvect_m,
814  temp_v);
815 
816  gsl_eigen_symmv_free(temp_v);
817 
818  /* allocate and initialise a new matrix temp_m */
819  /* this is eigvect_m scaled by the root of the eigenvalues */
820  gsl_matrix* temp_m = gsl_matrix_alloc (m_size, m_size);
821  for (j=0; j<m_size; ++j) {
822  // ignore very small negative numbers
823  v=gsl_vector_get(eigval_v,j);
824  if (v<0 && v>-0.000001)
825  {lambda=0.0;}
826  else if (v>0)
827  {lambda=pow(v,0.5);}
828  else
829  {
830  gsl_matrix_free(temp_m);
831  gsl_matrix_free(m);
832  gsl_matrix_free(eigvect_m);
833  gsl_vector_free(eigval_v);
834  std::string msg = "Enfa::sqrtm:Cannot calculate square root for matrix - model will fail\n", v;
835  Log::instance()->error( msg.c_str() );
836  throw InverseFailedException( msg.c_str() );
837  }
838 
839  for (i=0; i<m_size; ++i) {
840  gsl_matrix_set(temp_m, i, j, gsl_matrix_get(eigvect_m,i,j)*lambda);
841  }
842  }
843 
844  /* calculate the square root =temp_m * eigvect' */
845  gsl_matrix* _root = gsl_matrix_alloc (m_size, m_size);
846  // multiply using blas (note transpose for second item)
847  gsl_blas_dgemm(CblasNoTrans, CblasTrans,
848  1.0, temp_m, eigvect_m,
849  0.0, _root);
850 
851  /* code for checking the root - not really needed */
877  gsl_matrix_free(temp_m);
878  gsl_matrix_free(m);
879  gsl_matrix_free(eigvect_m);
880  gsl_vector_free(eigval_v);
881 
882  return _root;
883 
884 }
885 
886 /*****************************************************************
887  * Calculate the geometric mean of the distance from point v to
888  all species observation points in factored environmental space
889  weighting each factor accordingly
890 *****************************************************************/
891 double Enfa::getGeomean(gsl_vector* v) const
892 {
893 
894  //Log::instance()->info( "enfa::getGeomean\n" );
895 
896  //if (_verboseDebug)
897  //displayVector(v, "getGeomean input vector v:", true);
898 
899  gsl_vector * tmp_v = gsl_vector_alloc(_retained_components_count);
900  gsl_vector * tmp_gsl_factor_weights = gsl_vector_alloc(_retained_components_count);
901 
902  for (int i=0; i<_retained_components_count; ++i)
903  {
904  gsl_vector_set(tmp_v, i,gsl_vector_get(v,i));
905  gsl_vector_set(tmp_gsl_factor_weights, i, gsl_vector_get(_gsl_factor_weights,i));
906  }
907 
908  gsl_vector * tmp_distance_vector = gsl_vector_alloc(_retained_components_count);
909  //gsl_vector * tmp_distance_vector = gsl_vector_alloc(_layer_count);
910 
911  gsl_vector_view tmp_matrix_row_view;
912  double tmp_geomean=1.0;
913  double tmp_dist_workspace;
914 
915  // loop through localities
916  for (int i=0; i<_localityCount; ++i)
917  {
918  tmp_matrix_row_view = gsl_matrix_row(_gsl_environment_factor_matrix,i);
919  //displayVector(&tmp_matrix_row_view.vector, "getGeomean: locality vector i", true);
920  //for (int j=0; j<_layer_count; ++j)
921  // dont need to calculate beyonde the retained components
922  for (int j=0; j<_retained_components_count; ++j)
923  gsl_vector_set(tmp_distance_vector, j,
924  gsl_vector_get(&tmp_matrix_row_view.vector,j));
925  //displayVector(tmp_distance_vector, "getGeomean: locality vector i copy ", true);
926 
927  // difference between cell and current locality
928  gsl_vector_sub(tmp_distance_vector,tmp_v);
929  //gsl_vector_sub(tmp_distance_vector,v);
930  //displayVector(tmp_distance_vector, "getGeomean: locality vector i subbed ", true);
931 
932  // square the differences
933  gsl_vector_mul(tmp_distance_vector, tmp_distance_vector);
934  //displayVector(tmp_distance_vector, "getGeomean: locality vector i squared ", true);
935  // multiply by the factor weights
936  gsl_vector_mul(tmp_distance_vector,tmp_gsl_factor_weights);
937  //gsl_vector_mul(tmp_distance_vector,_gsl_factor_weights);
938  // square and sum the values to get the distance
939  //displayVector(tmp_distance_vector, "getGeomean: locality vector i weighted ", true);
940 
941  tmp_dist_workspace=pow(gsl_blas_dasum(tmp_distance_vector),0.5);
942  //Log::instance()->info( "euclidean distance: %6.2f\n", tmp_dist_workspace );
943 
944  // accumulate for geometric mean calculation
945  // if (tmp_dist_workspace!=0) tmp_geomean*=tmp_dist_workspace;
946  // log transform values to reduce chance of float overflow
947  if (tmp_dist_workspace!=0) tmp_geomean+=log(tmp_dist_workspace);
948  //Log::instance()->info( "accumulating distance: %6.2f\n", tmp_geomean );
949  }
950 
951  //Log::instance()->info( "tmp_geomean1: %6.2f\n", tmp_geomean );
952 
953  // finally work out geometric mean of distances to the species points
954  // tmp_geomean=pow(tmp_geomean,(double)1/_localityCount);
955  // reverse the log transformation
956  tmp_geomean=exp(tmp_geomean/_localityCount);
957 
958  //if (_verboseDebug)
959  // Log::instance()->info( "getgeomean returns: %6.2f\n", tmp_geomean );
960 
961  gsl_vector_free(tmp_distance_vector);
962  gsl_vector_free(tmp_v);
963  gsl_vector_free(tmp_gsl_factor_weights);
964 
965  return tmp_geomean;
966 }
967 
968 /* calculate the inverse using cholesky decomposition */
969 gsl_matrix* Enfa::inverse(gsl_matrix* _m) const
970 {
971  // dont crash on error - catch it
972  gsl_error_handler_t* old_handler = gsl_set_error_handler_off();
973 
974  int m_size=_m->size1;
975  gsl_matrix*_mcopy = gsl_matrix_alloc(m_size, m_size);
976  gsl_matrix*_inverse = gsl_matrix_alloc(m_size, m_size);
977  gsl_matrix_set_identity(_inverse);
978  gsl_matrix_memcpy(_mcopy,_m);
979  _gsl_vector_view _MI;
980 
981  int choldecomp;
982  choldecomp = gsl_linalg_cholesky_decomp(_mcopy);
983  Log::instance()->debug( "Cholesky decomp result %i\n", choldecomp );
984 
985  int cholsvx;
986  for (int i=0; i<m_size; i++)
987  {
988  _MI=gsl_matrix_row(_inverse,i);
989  cholsvx = gsl_linalg_cholesky_svx(_mcopy, &_MI.vector);
990  Log::instance()->debug( "Cholesky svx result %i\n", cholsvx );
991  //displayVector(&_MI.vector, "&MI.vector", true);
992  }
993 
994 
995  // sometimes the inverse fails - check it is correct
996  gsl_matrix *_testInverse = gsl_matrix_alloc(m_size, m_size);
997  gsl_blas_dgemm(CblasNoTrans,CblasNoTrans,1.0,
998  _m,_inverse,0.0,_testInverse);
999 
1000  int mycomp, myval;
1001  for (int i=0; i<m_size; i++)
1002  for (int j=0; j<m_size; j++)
1003  {
1004  // check values equal identity matrix (to accuracy of Xdp)
1005  myval=(int)(ceil(gsl_matrix_get(_testInverse,i,j)*pow((double)10,6)-0.49999999)/pow((double)10,6));
1006  mycomp = (i == j) ? 1 : 0;
1007  if (myval!=mycomp)
1008  {
1009  Log::instance()->warn( "Enfa::inverse failed to invert matrix!\n" );
1010  //displayMatrix(_m, "input matrix", true);
1011  //displayMatrix(_inverse, "inverse", true);
1012  //displayMatrix(_testInverse, "check inverse", true);
1013  gsl_matrix_free(_mcopy);
1014  gsl_matrix_free(_testInverse);
1015  std::string msg = "Enfa::inverse failed to invert matrix\n";
1016  Log::instance()->error( msg.c_str() );
1017  throw InverseFailedException( msg.c_str() );
1018  }
1019  }
1020 
1021  gsl_matrix_free(_mcopy);
1022  gsl_matrix_free(_testInverse);
1023 
1024  // restore error handler
1025  gsl_set_error_handler (old_handler);
1026 
1027  return _inverse;
1028 }
1029 
1030 /***********************/
1031 /**** autoCovariance ***/
1032 /* calculate covariance matrix as cov(<matrix>) matlab/octave function */
1033 gsl_matrix * Enfa::autoCovariance(gsl_matrix * original_matrix)
1034 {
1035  int j;
1036 
1037  int numrows = original_matrix->size1;
1038  int numcols = original_matrix->size2;
1039 
1040  // Build a copy of the input matrix to work with
1041  gsl_matrix * m = gsl_matrix_alloc (numrows, numcols);
1042  gsl_matrix_memcpy (m, original_matrix);
1043 
1044 
1045  if (numrows == 1)
1046  {
1047  gsl_matrix_transpose(m);
1048  }
1049 
1050  // compute: ones (n, 1) * sum (x)
1051  gsl_matrix * s = gsl_matrix_alloc (numrows, numcols);
1052 
1053  if (numrows == 1)
1054  {
1055  gsl_matrix_memcpy (m, s);
1056  }
1057  else
1058  {
1059  gsl_vector * v = gsl_vector_alloc(numcols);
1060 
1061  for (int i = 0; i < numcols; i++)
1062  {
1063  double val = 0.0;
1064 
1065  for (j = 0; j < numrows; j++)
1066  {
1067  val += gsl_matrix_get (m, j, i);
1068  }
1069 
1070  gsl_vector_set (v, i, val);
1071 
1072  for (j = 0; j < numrows; j++)
1073  {
1074  gsl_matrix_set_row (s, j, v);
1075  }
1076  }
1077  gsl_vector_free(v);
1078  }
1079 
1080  // divide by "n"
1081  gsl_matrix_scale (s, (double)1/numrows);
1082 
1083  // subtract the result from x
1084  gsl_matrix_sub (m, s);
1085 
1086  // x / (n - 1)
1087  //gsl_matrix_scale (m, (double)1/(numrows-1));
1088 
1089  // multiply by x'
1090  gsl_matrix * p = gsl_matrix_alloc (numcols, numcols);
1091  gsl_blas_dgemm(CblasTrans,CblasNoTrans,1.0,
1092  m,m,0.0,p);
1093 
1094  // scale the result
1095  gsl_matrix_scale (p, (double)1/(numrows-1));
1096 
1097  gsl_matrix_free (m);
1098  gsl_matrix_free (s);
1099 
1100  return p;
1101 }
1102 
1103 /**********************************************/
1104 /* decide which components to retain/discard */
1105 /* using the method defined by _discardMethod */
1106 /**********************************************/
1107 int Enfa::discardComponents() const
1108 {
1109  double variationTotal=0;
1110  int returnValue=_layer_count;
1111 
1112  // take the top X components
1113  if (_discardMethod==0)
1114  {
1115  Log::instance()->info( "Discarding components with fixed method (0)\n");
1116  returnValue=_retainComponents;
1117  for (int i=0; i<_retainComponents; ++i)
1118  variationTotal+=gsl_vector_get(_gsl_factor_weights_all_components,i);
1119  }
1120  // take the components that account for X % of variation
1121  else if (_discardMethod==1)
1122  {
1123  Log::instance()->info( "Discarding components with variation method (1) (variation=%6.2f)\n", _retainVariation);
1124  for (int i=0; i<_layer_count; ++i)
1125  {
1126  variationTotal+=gsl_vector_get(_gsl_factor_weights_all_components,i);
1127  if (variationTotal>=_retainVariation)
1128  {
1129  // variation exceeds specified amount - keep these components
1130  returnValue=i+1;
1131  break;
1132  }
1133  }
1134  }
1135  // take the components with variation higher than broken stick distribution
1136  else if (_discardMethod==2)
1137  {
1138  Log::instance()->info("Discarding components: Broken stick method (2)\n");
1139  /* work out the broken stick distribution of expected eigenvalues
1140  * based on Jackson's formula for the kth eigenvalue
1141  * bk = sum[i=k...p](1/i) where p=number of eigenvalues */
1142  /* use broken stick only for specialisation factors
1143  as often marginality weight is lower than other factors,
1144  but we always want to keep marginality as the first factor */
1145  gsl_vector* _brokenStickDist = gsl_vector_alloc(_layer_count-1);
1146  for (int i=0; i<_layer_count-1; ++i)
1147  {
1148  double tmp_total=0;
1149  for (int j=i+1; j<=_layer_count-1; ++j)
1150  tmp_total+=(double)1/j;
1151  gsl_vector_set(_brokenStickDist,i, (double)tmp_total/(_layer_count-1));
1152  }
1153 
1154  //displayVector(_brokenStickDist, "broken stick distribution", true);
1155 
1156  double myVariation;
1157  // now compare with the observed variation
1158  for (int i=0; i<_layer_count-1; ++i)
1159  {
1160  //myVariation=gsl_vector_get(_gsl_factor_weights_all_components,i);
1161  // rescale specificity ignoring marginality (first) factor
1162  myVariation=gsl_vector_get(_gsl_factor_weights_all_components,i+1)/(1-gsl_vector_get(_gsl_factor_weights_all_components,0));
1163  //Log::instance()->info("Orig variation %6.2f, new variation %6.2f\n", gsl_vector_get(_gsl_factor_weights_all_components,i+1), myVariation);
1164 
1165  if (myVariation <= gsl_vector_get(_brokenStickDist,i))
1166  {
1167  if (i==0) // problem - no components selected
1168  {
1169  Log::instance()->warn( "First specialisation component explains less variation than the broken stick distribution - retaining all components by default\n" );
1170  variationTotal=1;
1171  returnValue=_layer_count;
1172  break;
1173  }
1174  else // this component is discarded but keep earlier ones
1175  {
1176  returnValue=i+1;
1177  break;
1178  }
1179  }
1180  variationTotal+=myVariation;
1181  }
1182  gsl_vector_free(_brokenStickDist);
1183  }
1184  else
1185  {
1186  Log::instance()->warn( "Unknown problem whilst discarding components, retaining all components by default\n");
1187  variationTotal=1;
1188  }
1189  Log::instance()->info( "Retained components: %i/%i, variation explained: %.2f\n", returnValue, _layer_count, variationTotal);
1190  return returnValue;
1191 }
1192 
1193 
1194 
1196 bool Enfa::enfa1()
1197 {
1198 
1199  //calculate the mean and std deviation for background points
1200  //Log::instance()->info( "Calculating mean and stddev for background points\n" );
1201  _gsl_avg_background_vector = gsl_vector_alloc (_gsl_background_matrix->size2);
1202  _gsl_stddev_background_vector = gsl_vector_alloc (_gsl_background_matrix->size2) ;
1203  calculateMeanAndSd(_gsl_background_matrix,_gsl_avg_background_vector,_gsl_stddev_background_vector);
1204 
1205  if (_verboseDebug)
1206  {
1207  displayVector(_gsl_avg_background_vector, "Background Means", true);
1208  displayVector(_gsl_stddev_background_vector, "Background Stdev", true);
1209  }
1210 
1211  //calculate the mean and std deviation for centered presence points
1212  _gsl_avg_vector = gsl_vector_alloc (_gsl_environment_matrix->size2);
1213  _gsl_stddev_vector = gsl_vector_alloc (_gsl_environment_matrix->size2) ;
1214  calculateMeanAndSd(_gsl_environment_matrix,_gsl_avg_vector,_gsl_stddev_vector);
1215 
1216  if (_verboseDebug)
1217  {
1218  displayVector(_gsl_avg_vector, "Species Means", true);
1219  displayVector(_gsl_stddev_vector, "Species Stdev", true);
1220  }
1221 
1222  //center and standardise the data based on background means
1223  center(_gsl_environment_matrix,_localityCount);
1224  center(_gsl_background_matrix,_backgroundCount);
1225 
1226  //displayMatrix(_gsl_background_matrix, "_gsl_background_matrix", true);
1227 
1228  //calculate the mean and std deviation for centered presence points
1229  calculateMeanAndSd(_gsl_environment_matrix,_gsl_avg_vector,_gsl_stddev_vector);
1230 
1231  if (_verboseDebug)
1232  {
1233  displayVector(_gsl_avg_vector, "Species Means - centered", true);
1234  displayVector(_gsl_stddev_vector, "Species Stdev - centered", true);
1235  }
1236 
1237  //Now calculate the covariance matrix:
1238  //Log::instance()->info( "Calculating covariance matrix\n" );
1239  _gsl_covariance_matrix = autoCovariance(_gsl_environment_matrix);
1240  if (_verboseDebug)
1241  displayMatrix(_gsl_covariance_matrix, "Species Covariance Matrix", true);
1242 
1243  // now get covariance for background data
1244  _gsl_covariance_background_matrix = autoCovariance(_gsl_background_matrix);
1245  if (_verboseDebug)
1246  displayMatrix(_gsl_covariance_background_matrix, "Background Covariance Matrix", true);
1247 
1248  // get root inverse of species covariance matrix
1249  _gsl_covariance_matrix_root_inverse = gsl_matrix_alloc(_gsl_covariance_matrix->size1,_gsl_covariance_matrix->size2);
1250  gsl_matrix* gsl_sqrtm_matrix = sqrtm(_gsl_covariance_matrix);
1251  _gsl_covariance_matrix_root_inverse = inverse(gsl_sqrtm_matrix);
1252  gsl_matrix_free(gsl_sqrtm_matrix);
1253 
1254  //z = cov_species^-0.5 * m';
1255  _gsl_workspace_z = gsl_vector_alloc (_layer_count);
1256  gsl_blas_dgemv (CblasNoTrans, 1.0, _gsl_covariance_matrix_root_inverse, _gsl_avg_vector, 0.0, _gsl_workspace_z);
1257 
1258  if (_verboseDebug)
1259  displayVector(_gsl_workspace_z, "z", true);
1260 
1261  //W = cov_species^-0.5 * cov_global * cov_species^-0.5;
1262  gsl_matrix* _gsl_workspace_W_temp = gsl_matrix_alloc(_layer_count, _layer_count);
1263  _gsl_workspace_W = gsl_matrix_alloc(_layer_count, _layer_count);
1264  gsl_blas_dgemm(CblasNoTrans,CblasNoTrans,1.0,
1265  _gsl_covariance_matrix_root_inverse,
1266  _gsl_covariance_background_matrix,0.0,
1267  _gsl_workspace_W_temp);
1268 
1269  gsl_blas_dgemm(CblasNoTrans,CblasNoTrans,1.0,
1270  _gsl_workspace_W_temp,
1271  _gsl_covariance_matrix_root_inverse,0.0,
1272  _gsl_workspace_W);
1273 
1274  gsl_matrix_free(_gsl_workspace_W_temp);
1275 
1276  if (_verboseDebug)
1277  displayMatrix(_gsl_workspace_W, "W", true);
1278 
1279  //y = z / sqrtm(z' * z);
1280  //require view of z as an Nx1 matrix rather than a vector
1281  gsl_matrix_view _gsl_workspace_z_matrix_view;
1282  _gsl_workspace_z_matrix_view = gsl_matrix_view_vector(_gsl_workspace_z,
1283  1,_layer_count);
1284  double zz;
1285  gsl_blas_ddot(_gsl_workspace_z, _gsl_workspace_z, &zz);
1286 
1287  _gsl_workspace_y = gsl_matrix_alloc(1,_layer_count);
1288  gsl_matrix_memcpy(_gsl_workspace_y, &_gsl_workspace_z_matrix_view.matrix);
1289  gsl_matrix_scale(_gsl_workspace_y,pow(zz, -0.5));
1290 
1291  if (_verboseDebug)
1292  displayMatrix(_gsl_workspace_y, "y", true);
1293 
1294  //eye(N)=identity matrix with dimension NxN (use gsl_matrix_set_identity())
1295  //H = (eye(no_egvs) - y * y') * W * (eye(no_egvs) - y * y');
1296  _gsl_workspace_H = gsl_matrix_alloc(_layer_count,_layer_count);
1297  gsl_matrix* _gsl_workspace_H_temp1 = gsl_matrix_alloc(_layer_count,_layer_count);
1298  gsl_matrix* _gsl_workspace_H_temp2 = gsl_matrix_alloc(_layer_count,_layer_count);
1299  gsl_matrix* _gsl_workspace_H_temp3 = gsl_matrix_alloc(_layer_count,_layer_count);
1300 
1301  //initialise identity matrix
1302  gsl_matrix_set_identity(_gsl_workspace_H_temp1);
1303 
1304  // first step work out (y * y')
1305  gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0,
1306  _gsl_workspace_y,
1307  _gsl_workspace_y,
1308  0.0, _gsl_workspace_H_temp2);
1309 
1310  // now subtract from identity
1311  gsl_matrix_sub(_gsl_workspace_H_temp1,_gsl_workspace_H_temp2);
1312 
1313  // product of this with W
1314  gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0,
1315  _gsl_workspace_H_temp1,
1316  _gsl_workspace_W,
1317  0.0, _gsl_workspace_H_temp3);
1318 
1319  // finally product of this with temp1
1320  gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0,
1321  _gsl_workspace_H_temp3,
1322  _gsl_workspace_H_temp1,
1323  0.0, _gsl_workspace_H);
1324 
1325  gsl_matrix_free(_gsl_workspace_H_temp1);
1326  gsl_matrix_free(_gsl_workspace_H_temp2);
1327  gsl_matrix_free(_gsl_workspace_H_temp3);
1328 
1329  if (_verboseDebug)
1330  displayMatrix(_gsl_workspace_H, "H", true);
1331 
1332  //work out eigen-vectors/values of H
1333  _gsl_eigenvalue_vector = gsl_vector_alloc (_layer_count);
1334  gsl_vector * _gsl_eigenvalue_vector_copy = gsl_vector_alloc (_layer_count);
1335  _gsl_eigenvector_matrix = gsl_matrix_alloc (_layer_count, _layer_count);
1336  gsl_eigen_symmv_workspace * _gsl_eigen_workpace = gsl_eigen_symmv_alloc (_layer_count);
1337  gsl_eigen_symmv (_gsl_workspace_H,
1338  _gsl_eigenvalue_vector,
1339  _gsl_eigenvector_matrix,
1340  _gsl_eigen_workpace);
1341 
1342  //free the temporary workspace
1343  gsl_eigen_symmv_free (_gsl_eigen_workpace);
1344 
1345  //create score_matrix = cov_species^-0.5 * v;
1346  _gsl_score_matrix=gsl_matrix_alloc(_layer_count,_layer_count);
1347  gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0,
1348  _gsl_covariance_matrix_root_inverse,
1349  _gsl_eigenvector_matrix,
1350  0.0, _gsl_score_matrix);
1351 
1352  /* sort eigenvalues, eigenvectors and score matrix */
1353  gsl_vector * _gsl_eigenvalue_vector_presort = gsl_vector_alloc(_layer_count);
1354  gsl_vector_memcpy(_gsl_eigenvalue_vector_presort, _gsl_eigenvalue_vector);
1355  gsl_eigen_symmv_sort (_gsl_eigenvalue_vector, _gsl_eigenvector_matrix,
1356  GSL_EIGEN_SORT_VAL_DESC);
1357  gsl_eigen_symmv_sort (_gsl_eigenvalue_vector_presort, _gsl_score_matrix,
1358  GSL_EIGEN_SORT_VAL_DESC);
1359 
1360  //Log::instance()->info( "Eigenvector sorted\n" );
1361 
1362  /* Replace the min eigenvalue with the difference of the trace of W & H
1363  this becomes eigenvalue 1, the remaining eigenvalues move to index 2+
1364  t = eigenvalues ~= min(eigenvalues);
1365  eigenvalues(2:no_egvs) = eigenvalues(t);
1366  eigenvalues(1) = trace(W) - trace(H)*/
1367 
1368  _gsl_vector_min = gsl_vector_min_index(_gsl_eigenvalue_vector);
1369  //Log::instance()->info( "min eigevalue: %i\n", _gsl_vector_min);
1370  //Log::instance()->info( "min eigevalue: %6.2f\n", gsl_vector_min(_gsl_eigenvalue_vector));
1371 
1372  gsl_vector_memcpy (_gsl_eigenvalue_vector_copy,
1373  _gsl_eigenvalue_vector);
1374 
1375  // note starting at 1 as position 0 will be overwritten later
1376  for (int i=1; i<_layer_count; i++)
1377  {
1378  if (_gsl_vector_min>=i)
1379  {
1380  gsl_vector_set(_gsl_eigenvalue_vector,i,
1381  gsl_vector_get(_gsl_eigenvalue_vector_copy,i-1));
1382  }
1383  }
1384 
1385  //displayVector(_gsl_eigenvalue_vector, "_gsl_eigenvalue_vector:A", true);
1386  //displayVector(_gsl_eigenvalue_vector_copy, "_gsl_eigenvalue_vector_copy:A", true);
1387 
1388  gsl_vector_free(_gsl_eigenvalue_vector_copy);
1389 
1390  // now set first eigenvalue to trace(W) - trace(H)
1391  double traceW=0.0;
1392  for (int i=0; i<_layer_count; ++i) traceW+=gsl_matrix_get(_gsl_workspace_W, i, i);
1393  double traceH=0.0;
1394  for (int i=0; i<_layer_count; ++i) traceH+=gsl_matrix_get(_gsl_workspace_H, i, i);
1395  gsl_vector_set(_gsl_eigenvalue_vector,0,traceW-traceH);
1396 
1397  /* % Figure out which eigenvalue has gone AWOL */
1398 
1399  /* remove column associated with dodgy eigenvector (_gsl_vector_min) */
1400  /* Replace the first column with the vector of means */
1401  /* and the move other columns down */
1402  gsl_matrix * _gsl_score_matrix_copy=gsl_matrix_alloc(_layer_count,_layer_count);
1403  gsl_matrix_memcpy(_gsl_score_matrix_copy, _gsl_score_matrix);
1404 
1405  for (int i=0; i<_layer_count; ++i)
1406  for (int j=0; j<_layer_count; ++j)
1407  {
1408  if (j==0)
1409  {
1410  gsl_matrix_set(_gsl_score_matrix,i,j,gsl_vector_get(_gsl_avg_vector,i));
1411  }
1412  else if (_gsl_vector_min>=j)
1413  {
1414  gsl_matrix_set(_gsl_score_matrix,i,j,
1415  gsl_matrix_get(_gsl_score_matrix_copy,i,j-1));
1416  }
1417  }
1418 
1419  gsl_matrix_free(_gsl_score_matrix_copy);
1420 
1421  double score2norm;
1422  /* Norm the columns of the score_matrix
1423  - that is divide each value by the 2-norm of its column */
1424  for (int i=0; i<_layer_count; ++i)
1425  {
1426  // work out the 2-norm for this column = sqrt(sum(x^2))
1427  double unorm=0.0;
1428  for (int j=0; j<_layer_count; ++j)
1429  unorm+=pow(gsl_matrix_get(_gsl_score_matrix,j,i),2);
1430  unorm=pow(unorm,0.5);
1431 
1432  //Log::instance()->info( "col i: %i norm: %6.2f\n", i, unorm);
1433 
1434  // and apply this normalisation to the matrix
1435  for (int j=0; j<_layer_count; ++j)
1436  {
1437  score2norm = gsl_matrix_get(_gsl_score_matrix,j,i);
1438  //Log::instance()->info( "score 2 norm: %6.2f\n", score2norm);
1439  score2norm = score2norm/unorm;
1440  //Log::instance()->info( "score 2 normed: %6.2f\n", score2norm);
1441  gsl_matrix_set(_gsl_score_matrix,j,i,score2norm);
1442  }
1443  }
1444 
1445  if (_verboseDebug)
1446  {
1447  displayVector(_gsl_eigenvalue_vector, "Eigenvalues", true);
1448  displayMatrix(_gsl_eigenvector_matrix, "Eigenvectors", true);
1449  displayMatrix(_gsl_score_matrix, "Score Matrix", true);
1450  }
1451 
1452  /* marginality = geometric mean of means
1453  marginality = sqrt(sum(m.^2));
1454  marginality = marginality / 1.96 */
1455 
1456  _marginality=0.0;
1457  for (int i=0; i<_layer_count; ++i)
1458  _marginality+=pow(gsl_vector_get(_gsl_avg_vector,i),2);
1459  _marginality=pow(_marginality,0.5)/1.96;
1460  Log::instance()->info( "Marginality: %6.2f\n", _marginality );
1461 
1462  /* Calculate global specialization
1463  specialization = sqrt(sum(eigenvalues)/length(m)) */
1464  _specialisation=pow((gsl_blas_dasum(_gsl_eigenvalue_vector)/_layer_count),0.5);
1465  Log::instance()->info( "Specialisation: %6.2f\n", _specialisation );
1466 
1467 
1468  /* work out factor weights = eigenvalues/sum(eigenvalues)
1469  * dicarded components are given a zero weighting */
1470  _gsl_factor_weights_all_components = gsl_vector_alloc(_layer_count);
1471  for (int i=0; i<_layer_count; ++i)
1472  {
1473  gsl_vector_set(_gsl_factor_weights_all_components,i,
1474  fabs(gsl_vector_get(_gsl_eigenvalue_vector,i)));
1475  }
1476  gsl_vector_scale(_gsl_factor_weights_all_components, 1.0/gsl_blas_dasum(_gsl_factor_weights_all_components));
1477 
1478  //After the model is generated, we can discard unwanted components!
1479  _retained_components_count=discardComponents();
1480 
1481  /* work out factor weights = eigenvalues/sum(eigenvalues)
1482  * dicarded components are given a zero weighting */
1483  _gsl_factor_weights = gsl_vector_alloc(_layer_count);
1484  gsl_vector_set_zero(_gsl_factor_weights);
1485  for (int i=0; i<_retained_components_count; ++i)
1486  {
1487  gsl_vector_set(_gsl_factor_weights,i,
1488  fabs(gsl_vector_get(_gsl_eigenvalue_vector,i)));
1489  }
1490  gsl_vector_scale(_gsl_factor_weights, 1.0/gsl_blas_dasum(_gsl_factor_weights));
1491 
1492  // print factor weights
1493  for (int i=0; i<_retained_components_count; ++i)
1494  {
1495  if (i==0)
1496  {
1497  Log::instance()->info("Marginality (factor 0) weight: %.2f, variation explained: %.2f\n", gsl_vector_get(_gsl_factor_weights,i), gsl_vector_get(_gsl_factor_weights_all_components,i));
1498  }
1499  else
1500  {
1501  Log::instance()->info("Specialisation factor %i weight: %.2f, variation explained: %.2f\n", i, gsl_vector_get(_gsl_factor_weights,i), gsl_vector_get(_gsl_factor_weights_all_components,i));
1502  }
1503  }
1504 
1505  /* Display the variable loadings of each factor (score matrix) */
1506  if (_displayLoadings)
1507  {
1508  displayLoadings(_gsl_score_matrix, _retained_components_count);
1509  }
1510 
1511  /* work out geometric mean of the distance between each species locality and
1512  every other locality based on the factored data */
1513  _gsl_geomean_vector = gsl_vector_alloc(_localityCount);
1514  _gsl_environment_factor_matrix = gsl_matrix_alloc(_localityCount,_layer_count);
1515  gsl_blas_dgemm(CblasNoTrans, CblasNoTrans,
1516  1.0, _gsl_environment_matrix, _gsl_score_matrix,
1517  0.0, _gsl_environment_factor_matrix);
1518 
1519  gsl_vector_view _gsl_geomean_workspace;
1520 
1521  double _tmp_geomean;
1522 
1523  //Log::instance()->info( "Calculating geometric means\n" );
1524  //displayMatrix(_gsl_environment_factor_matrix, "_gsl_environment_factor_matrix", true);
1525 
1526  // loop through the localities and calculate the geomean
1527  for (int i=0; i<_localityCount; ++i)
1528  {
1529  _gsl_geomean_workspace=gsl_matrix_row(_gsl_environment_factor_matrix,i);
1530  _tmp_geomean=getGeomean(&_gsl_geomean_workspace.vector);
1531  gsl_vector_set(_gsl_geomean_vector, i, _tmp_geomean);
1532  }
1533 
1534  // finally sort the geomeans to speed up processing later
1535  gsl_sort_vector(_gsl_geomean_vector);
1536 
1537  Log::instance()->info( "ENFA Model Generation Completed\n" );
1538 
1539  return true;
1540 
1541 }
1542 
1543 
1544 
1545 /****************************************************************/
1546 /****************** configuration *******************************/
1547 void
1548 Enfa::_getConfiguration( ConfigurationPtr& config ) const
1549 {
1550  if (!_done )
1551  return;
1552 
1553  ConfigurationPtr model_config( new ConfigurationImpl("Enfa") );
1554  config->addSubsection( model_config );
1555 
1556  // _marginality
1557  model_config->addNameValue( "Marginality", _marginality );
1558 
1559  // _specialisation
1560  model_config->addNameValue( "Specialisation", _specialisation );
1561 
1562  // _retained_components_count
1563  model_config->addNameValue( "RetainedComponents", _retained_components_count );
1564 
1565  double *values = new double[_layer_count];
1566 
1567  // _gsl_avg_background_vector
1568  for (int i=0; i < _layer_count; ++i)
1569  values[i] = gsl_vector_get(_gsl_avg_background_vector, i);
1570 
1571  model_config->addNameValue( "AvgBackgroundVector", values, _layer_count );
1572 
1573  // _gsl_stddev_background_vector
1574  for (int i=0; i < _layer_count; ++i)
1575  values[i] = gsl_vector_get(_gsl_stddev_background_vector, i);
1576 
1577  model_config->addNameValue( "StddevBackgroundVector", values, _layer_count );
1578 
1579  // _gsl_avg_vector
1580  for (int i=0; i < _layer_count; ++i)
1581  values[i] = gsl_vector_get(_gsl_avg_vector, i);
1582 
1583  model_config->addNameValue( "AvgVector", values, _layer_count );
1584 
1585  // _gsl_stddev_vector
1586  for (int i=0; i < _layer_count; ++i)
1587  values[i] = gsl_vector_get(_gsl_stddev_vector, i);
1588 
1589  model_config->addNameValue( "StddevVector", values, _layer_count );
1590 
1591  // _gsl_eigenvalue_vector
1592  for (int i=0; i < _layer_count; ++i)
1593  values[i] = gsl_vector_get(_gsl_eigenvalue_vector, i);
1594 
1595  model_config->addNameValue( "EigenvalueVector", values, _layer_count );
1596 
1597  // _gsl_factor_weights
1598  for (int i=0; i < _layer_count; ++i)
1599  values[i] = gsl_vector_get(_gsl_factor_weights, i);
1600 
1601  model_config->addNameValue( "FactorWeights", values, _layer_count );
1602 
1603  delete[] values;
1604 
1605  // _gsl_eigenvector_matrix
1606  int num_cells = _layer_count * _layer_count;
1607 
1608  double *flat_eigenvector_matrix = new double[num_cells];
1609 
1610  int cnt = 0;
1611 
1612  for (int i=0; i < _layer_count; ++i)
1613  for (int j=0; j < _layer_count; ++j, ++cnt)
1614  flat_eigenvector_matrix[cnt] = gsl_matrix_get( _gsl_eigenvector_matrix, i, j );
1615 
1616  model_config->addNameValue( "EigenvectorMatrix", flat_eigenvector_matrix, num_cells );
1617 
1618  delete[] flat_eigenvector_matrix;
1619 
1620  // _gsl_score_matrix
1621  num_cells = _layer_count * _layer_count;
1622 
1623  double *flat_score_matrix = new double[num_cells];
1624 
1625  cnt = 0;
1626 
1627  for (int i=0; i < _layer_count; ++i)
1628  for (int j=0; j < _layer_count; ++j, ++cnt)
1629  flat_score_matrix[cnt] = gsl_matrix_get( _gsl_score_matrix, i, j );
1630 
1631  model_config->addNameValue( "ScoreMatrix", flat_score_matrix, num_cells );
1632 
1633  delete[] flat_score_matrix;
1634 
1635  // _gsl_environment_factor_matrix
1636  num_cells = _localityCount * _layer_count;
1637 
1638  double *flat_environment_factor_matrix = new double[num_cells];
1639 
1640  cnt = 0;
1641 
1642  for (int i=0; i < _localityCount; ++i)
1643  for (int j=0; j < _layer_count; ++j, ++cnt)
1644  flat_environment_factor_matrix[cnt] = gsl_matrix_get( _gsl_environment_factor_matrix, i, j );
1645 
1646  model_config->addNameValue( "EnvironmentFactorMatrix", flat_environment_factor_matrix, num_cells );
1647 
1648  delete[] flat_environment_factor_matrix;
1649 
1650  // _gsl_geomean_vector
1651  values = new double[_localityCount];
1652 
1653  for (int i=0; i < _localityCount; ++i)
1654  values[i] = gsl_vector_get(_gsl_geomean_vector, i);
1655 
1656  model_config->addNameValue( "LocalityGeomeans", values, _localityCount );
1657 
1658  delete[] values;
1659 
1660 }
1661 
1662 void
1663 Enfa::_setConfiguration( const ConstConfigurationPtr& config )
1664 {
1665  ConstConfigurationPtr model_config = config->getSubsection( "Enfa",false );
1666 
1667  if (!model_config)
1668  return;
1669 
1670  // _retained_components_count
1671  _retained_components_count = model_config->getAttributeAsInt( "RetainedComponents", 0 );
1672 
1673  // _marginality
1674  _marginality = model_config->getAttributeAsDouble( "Marginality", 0.0 );
1675 
1676  // _specialisation
1677  _specialisation = model_config->getAttributeAsDouble( "Specialisation", 0.0 );
1678 
1679  // _gsl_avg_vector
1680  std::vector<double> stl_vector = model_config->getAttributeAsVecDouble( "AvgVector" );
1681 
1682  _layer_count = stl_vector.size();
1683 
1684  _gsl_avg_vector = gsl_vector_alloc( _layer_count );
1685 
1686  for (int i=0; i < _layer_count; ++i)
1687  gsl_vector_set( _gsl_avg_vector, i, stl_vector[i] );
1688 
1689  // _gsl_avg_background_vector
1690  stl_vector = model_config->getAttributeAsVecDouble( "AvgBackgroundVector" );
1691 
1692  _layer_count = stl_vector.size();
1693 
1694  _gsl_avg_background_vector = gsl_vector_alloc( _layer_count );
1695 
1696  for (int i=0; i < _layer_count; ++i)
1697  gsl_vector_set( _gsl_avg_background_vector, i, stl_vector[i] );
1698 
1699  // _gsl_stddev_vector
1700  stl_vector = model_config->getAttributeAsVecDouble( "StddevVector" );
1701 
1702  _gsl_stddev_vector = gsl_vector_alloc( _layer_count );
1703 
1704  for (int i=0; i < _layer_count; ++i)
1705  gsl_vector_set( _gsl_stddev_vector, i, stl_vector[i] );
1706 
1707  // _gsl_stddev_background_vector
1708  stl_vector = model_config->getAttributeAsVecDouble( "StddevBackgroundVector" );
1709 
1710  _gsl_stddev_background_vector = gsl_vector_alloc( _layer_count );
1711 
1712  for (int i=0; i < _layer_count; ++i)
1713  gsl_vector_set( _gsl_stddev_background_vector, i, stl_vector[i] );
1714 
1715  // _gsl_eigenvalue_vector
1716  stl_vector = model_config->getAttributeAsVecDouble( "EigenvalueVector" );
1717 
1718  _layer_count = stl_vector.size();
1719 
1720  _gsl_eigenvalue_vector = gsl_vector_alloc( _layer_count );
1721 
1722  for (int i=0; i < _layer_count; ++i)
1723  gsl_vector_set( _gsl_eigenvalue_vector, i, stl_vector[i] );
1724 
1725  // _gsl_factor_weights
1726  stl_vector = model_config->getAttributeAsVecDouble( "FactorWeights" );
1727 
1728  _layer_count = stl_vector.size();
1729 
1730  _gsl_factor_weights = gsl_vector_alloc( _layer_count );
1731 
1732  for (int i=0; i < _layer_count; ++i)
1733  gsl_vector_set( _gsl_factor_weights, i, stl_vector[i] );
1734 
1735  // _gsl_eigenvector_matrix
1736  stl_vector = model_config->getAttributeAsVecDouble( "EigenvectorMatrix" );
1737 
1738  _gsl_eigenvector_matrix = gsl_matrix_alloc( _layer_count, _layer_count );
1739 
1740  int cnt = 0;
1741 
1742  for (int i=0; i < _layer_count; ++i)
1743  for (int j=0; j < _layer_count; ++j, ++cnt)
1744  gsl_matrix_set( _gsl_eigenvector_matrix, i, j, stl_vector[cnt] );
1745 
1746 
1747  // _gsl_score_matrix
1748  stl_vector = model_config->getAttributeAsVecDouble( "ScoreMatrix" );
1749 
1750  _gsl_score_matrix = gsl_matrix_alloc( _layer_count, _layer_count );
1751 
1752  cnt = 0;
1753 
1754  for (int i=0; i < _layer_count; ++i)
1755  for (int j=0; j < _layer_count; ++j, ++cnt)
1756  gsl_matrix_set( _gsl_score_matrix, i, j, stl_vector[cnt] );
1757 
1758 
1759  // _gsl_environment_factor_matrix
1760  stl_vector = model_config->getAttributeAsVecDouble( "EnvironmentFactorMatrix" );
1761 
1762  _localityCount = stl_vector.size()/_layer_count;
1763 
1764  _gsl_environment_factor_matrix = gsl_matrix_alloc( _localityCount, _layer_count );
1765 
1766  cnt = 0;
1767 
1768  for (int i=0; i < _localityCount; ++i)
1769  for (int j=0; j < _layer_count; ++j, ++cnt)
1770  gsl_matrix_set( _gsl_environment_factor_matrix, i, j, stl_vector[cnt] );
1771 
1772 
1773  // _gsl_geomean_vector
1774  stl_vector = model_config->getAttributeAsVecDouble( "LocalityGeomeans" );
1775 
1776  _gsl_geomean_vector = gsl_vector_alloc( _localityCount );
1777 
1778  for (int i=0; i < _localityCount; ++i)
1779  gsl_vector_set( _gsl_geomean_vector, i, stl_vector[i] );
1780 
1781  _done = true;
1782 
1783 }
void warn(const char *format,...)
'Warn' level.
Definition: Log.cpp:273
double Scalar
Type of map values.
Definition: om_defs.hh:39
static AlgParamMetadata parameters[NUM_PARAM]
Definition: enfa.cpp:42
static Log * instance()
Returns the instance pointer, creating the object on the first call.
Definition: Log.cpp:45
void error(const char *format,...)
'Error' level.
Definition: Log.cpp:290
OM_ALG_DLL_EXPORT AlgMetadata const * algorithmMetadata()
Definition: enfa.cpp:161
#define UNUSED(symbol)
Definition: os_specific.hh:55
static AlgMetadata metadata
Definition: enfa.cpp:132
#define NUM_PARAM
Definition: aquamaps.cpp:50
void info(const char *format,...)
'Info' level.
Definition: Log.cpp:256
#define BACKGROUND_ID
std::vector< OccurrencePtr >::const_iterator const_iterator
Definition: Occurrences.hh:85
OM_ALG_DLL_EXPORT AlgorithmImpl * algorithmFactory()
Definition: enfa.cpp:154
void debug(const char *format,...)
'Debug' level.
Definition: Log.cpp:237
Definition: Sample.hh:25