openModeller  Version 1.5.0
best_subsets.cpp
Go to the documentation of this file.
1 
30 #include "best_subsets.hh"
31 
32 #include <openmodeller/om.hh>
33 #include <openmodeller/Random.hh>
34 #include <openmodeller/Sampler.hh>
36 
37 #include <stdlib.h>
38 #include <stdio.h>
39 #include <math.h>
40 #include <string>
41 using std::string;
42 
43 #ifdef WIN32
44 #include <windows.h>
45 #define SLEEP(secs) Sleep(secs * 1000)
46 #else
47 #include <unistd.h>
48 #define SLEEP(secs) sleep(secs);
49 #endif
50 
51 
52 /****************************************************************/
53 void BestSubsets::printListOfRuns(string msg, AlgorithmRun ** runs, int numOfRuns)
54 {
55  printf("%s\n", msg.c_str());
56  for (int i = 0; i < numOfRuns; i++)
57  printf("%4d] om=%5.3f comm=%5.3f (id=%d)\n", i,
58  runs[i]->getOmission(),
59  runs[i]->getCommission(),
60  runs[i]->getId());
61 }
62 
63 /****************************************************************/
64 /****************** Garp class **********************************/
65 
67 : AlgorithmImpl(metadata)
68 {
69  _trainProp = 0.0;
70  _totalRuns = 0;
71  _omissionThreshold = 0.0;
75 
76  _softOmissionThreshold = false;
78 
79  _finishedRun = NULL;
80  _activeRun = NULL;
81  _bestRun = NULL;
82 
83  _nparam = 0;
84  _alg_params = NULL;
85 
86  _numFinishedRuns = 0;
87  _numActiveRuns = 0;
88  _done = false;
89 
90  _maxProgress = 0.0;
91 
92 }
93 
94 // ****************************************************************
96 {
97  int i;
98 
99  if (_finishedRun)
100  {
101  for (i = 0; i < _numFinishedRuns; i++)
102  { delete _finishedRun[i]; }
103  delete[] _finishedRun;
104  }
105 
106  if (_activeRun)
107  { delete[] _activeRun; }
108 
109  // bestRun just point to objects referenced by _finishedRun object
110  if (_bestRun)
111  { delete[] _bestRun; }
112 
113  if (_alg_params)
114  delete[] _alg_params;
115 }
116 
117 // ****************************************************************
118 // ************* initialize ***************************************
119 
121 {
122  // BS parameters
123  if (!getParameter("TrainingProportion", &_trainProp)) {
124  Log::instance()->error("Parameter TrainingProportion not set properly.\n");
125  return 0;
126  }
127 
128  if (!getParameter("TotalRuns", &_totalRuns)) {
129  Log::instance()->error("Parameter TotalRuns not set properly.\n");
130  return 0;
131  }
132 
133  if (!getParameter("HardOmissionThreshold", &_omissionThreshold)) {
134  Log::instance()->error("Parameter HardOmissionThreshold not set properly.\n");
135  return 0;
136  }
137 
138  if (!getParameter("ModelsUnderOmissionThreshold", &_modelsUnderOmission)) {
139  Log::instance()->error("Parameter ModelsUnderOmissionThreshold not set properly.\n");
140  return 0;
141  }
142 
143  if (!getParameter("CommissionThreshold", &_commissionThreshold)) {
144  Log::instance()->error("Parameter CommissionThreshold not set properly.\n");
145  return 0;
146  }
147 
148  if (!getParameter("CommissionSampleSize", &_commissionSampleSize)) {
149  Log::instance()->error("Parameter CommissionSampleSize not set properly.\n");
150  return 0;
151  }
152 
153  if (!getParameter("MaxThreads", &_maxThreads)) {
154  Log::instance()->error("Parameter MaxThreads not set properly.\n");
155  return 0;
156  }
157 
158  if ( _maxThreads < 1 )
159  {
160  _maxThreads = 1;
161  }
162  else if ( _maxThreads > 1 )
163  {
164  // When maxThreads is greater than 1, if the machine has only one processor om
165  // can crash. If the machine has more than one processor GDAL can output lots
166  // of IO errors (current GDAL version does not seem to be thread safe).
167  Log::instance()->warn("Multithreading is still experimental. When max threads is greater than 1, depending on software and hardware configuration this application may crash or you may see lots of raster IO warnings. In these cases, we recommend you to set this parameter to 1.\n");
168  }
169 
170  if (_trainProp <= 1.0)
171  {
172  Log::instance()->warn("The specified training proportion value is less than or equals 1. Please note that there was a change in the valid range for this parameter from 0-1 to 0-100. Small values may result in zero presence points being used to train the model.\n");
173  }
174 
175  // convert percentages (100%) to proportions (1.0) for external parameters
176  _trainProp /= 100.0;
177  _commissionThreshold /= 100.0;
178  _omissionThreshold /= 100.0;
179 
182  {
183  Log::instance()->warn("ModelsUnderOmission (%d) is greater than the number of runs (%d). ModelsUnderOmission will be reduced to (%d)\n", _modelsUnderOmission, _totalRuns, _totalRuns);
185  }
186 
189 
191 
192  return 1;
193 }
194 
195 /****************************************************************/
196 /****************** iterate *************************************/
197 
199 {
200  static int iterations = 0;
201  int active;
202  AlgorithmRun * algRun;
203 
204  ++iterations;
205 
206  if (_done)
207  { return 1; }
208 
209  // check if it should start new runs
212  {
213  // it needs to start more runs
214  // wait for a slot for a new thread
215  if ((active = numActiveThreads()) >= _maxThreads)
216  {
217  //Log::instance()->info("%5d] Waiting for a slot to run next thread (%d out of %d)\n", iterations, active, _maxThreads);
218  SLEEP(2);
219  }
220 
221  else
222  {
223  int runId = _numFinishedRuns + _numActiveRuns;
224 
225  //Log::instance()->info("%5d] There is an empty slot to run next thread (%d out of %d) - %d\n", iterations, active, _maxThreads, runId);
226 
227  // start new Algorithm
228  SamplerPtr train, test;
229  splitSampler( _samp, &train, &test, _trainProp );
230 
231  Log::instance()->debug( "Presences: orig=%d, train=%d, test=%d\n", _samp->numPresence(), train->numPresence(), test->numPresence() );
232  Log::instance()->debug( "Absences: orig=%d, train=%d, test=%d\n", _samp->numAbsence(), train->numAbsence(), test->numAbsence() );
233 
234  algRun = new AlgorithmRun();
235  algRun->initialize(runId,
236  _commissionSampleSize, train, test,
238  (BSAlgorithmFactory *) this);
239  _activeRun[_numActiveRuns++] = algRun;
240  algRun->run();
241  }
242  }
243 
244  else
245  {
246  // no more runs are needed
247  // check if all active threads have finished
248  active = numActiveThreads();
249  if (active)
250  {
251  // there are still threads running
252  /*
253  Log::instance()->info("%5d] Waiting for %d active thread(s) to finish.\n",
254  iterations, active);
255  */
256  SLEEP(2);
257  }
258 
259  else
260  {
261  // all running threads terminated
262  // calculate best subset and exit
263  //Log::instance()->info("%5d] Calculating best and terminating algorithm.\n", i);
265  _done = true;
266  }
267  }
268 
269  return 1;
270 }
271 
272 /****************************************************************/
274 {
275  int i;
276  AlgorithmRun * run;
277 
278  for (i = 0; i < _numActiveRuns; i++)
279  {
280  run = _activeRun[i];
281 
282  if (!run->running())
283  {
284  //Log::instance()->info("Thread %d has just finished.\n", run->getId());
285 
286  // run finished its work
287  // move it to finished runs
288  // and remove it from list of active runs
291  _activeRun[_numActiveRuns] = NULL;
292 
293  // update count of models under omission threshold
295  {
296  if (run->getOmission() <= _omissionThreshold)
298  }
299  }
300  }
301 
302  return _numActiveRuns;
303 }
304 
305 /****************************************************************/
307 {
308  return (!_softOmissionThreshold) &&
310 }
311 
312 /****************************************************************/
314 {
315  int i;
316 
317  Log::instance()->info("Calculating best subset of models.\n");
318 
319  // make a copy of finished runs to play with
320  AlgorithmRun ** runList = new AlgorithmRun*[_numFinishedRuns];
321  for (i = 0; i < _numFinishedRuns; i++)
322  { runList[i] = _finishedRun[i]; }
323 
324  printListOfRuns("Finished Runs:", runList, _numFinishedRuns);
325 
326  // get list of models that pass omission test
327  // sort runs by omission
328  // first <_modelsUnderOmission> runs are the selected ones
329  sortRuns(runList, _numFinishedRuns, 0);
330 
331  printListOfRuns("Finished Runs by Omission:", runList, _numFinishedRuns);
332 
333  // get list of models that pass commission test
334  sortRuns(runList, _modelsUnderOmission, 1);
335 
336  printListOfRuns("Best Omission Runs by Commission:", runList, _numFinishedRuns);
337 
338  _numBestRuns = (int)( _commissionThreshold * (double)_modelsUnderOmission + 0.5 );
339  int medianRun = _modelsUnderOmission / 2;
340  int firstRun = (int)ceil( (double)medianRun - (double)_numBestRuns / 2.0 );
341 
343 
344  for (i = 0; i < _numBestRuns; i++)
345  { _bestRun[i] = runList[i + firstRun]; }
346 
347  printListOfRuns("Best Runs:", _bestRun, _numBestRuns);
348 
349  printf("Median: %d First: %d\n", medianRun, firstRun);
350 
351 
352  delete[] runList;
353 
354  Log::instance()->info("Selected best %d models out of %d.\n", _numBestRuns, _totalRuns);
355 
356  return 1;
357 }
358 
359 /****************************************************************/
361  int nelements, int errorType)
362 {
363  int i, j;
364  AlgorithmRun * runJ0, * runJ1;
365 
366  Log::instance()->info("Sorting list %d of %d elements by index %d.\n", runList, nelements, errorType);
367 
368  // bubble sort
369  // TODO: change to quicksort if this becomes a bottleneck
370  for (i = 0; i < nelements - 1; i++)
371  {
372  for (j = 0; j < nelements - i - 1; j++)
373  {
374  runJ0 = runList[j];
375  runJ1 = runList[j + 1];
376 
377  if (runJ0->getError(errorType) > runJ1->getError(errorType))
378  {
379  // exchange elements j and j + 1
380  runList[j] = runJ1;
381  runList[j + 1] = runJ0;
382  }
383  }
384  }
385 }
386 
387 /****************************************************************/
388 /****************** done ****************************************/
389 
390 int BestSubsets::done() const
391 {
392  return _done;
393 }
394 
395 /****************************************************************/
396 /****************** done ****************************************/
397 
399 {
400  if (done())
401  { return 1.0; }
402 
403  else
404  {
405  float progByTotalRuns = 0.0;
406  float progByHardOmission = 0.0;
407 
408  float avgProgressActiveRuns = 0.0;
409  for (int i = 0; i < _numActiveRuns; i++)
410  { avgProgressActiveRuns += _activeRun[i]->getProgress(); }
411  avgProgressActiveRuns /= _numActiveRuns;
412 
413  progByTotalRuns = (_numFinishedRuns + avgProgressActiveRuns) / (float) _totalRuns;
414 
416  {
417  progByHardOmission = (_currentModelsUnderOmissionThreshold /
418  (float) _modelsUnderOmission);
419  }
420 
421  float progress = (progByTotalRuns > progByHardOmission)? progByTotalRuns : progByHardOmission;
422 
423  if (progress > _maxProgress)
424  { _maxProgress = progress; }
425 
426  return _maxProgress;
427  }
428 }
429 
430 /****************************************************************/
431 /****************** getValue ************************************/
432 
434 {
435  int i;
436  double sum = 0.0;
437 
438  if (_done)
439  {
440  for (i = 0; i < _numBestRuns; i++)
441  { sum += _bestRun[i]->getValue(x); }
442  }
443 
444  return sum / (double) _numBestRuns;
445 }
446 
447 /****************************************************************/
448 /****************** getConvergence ******************************/
449 
450 int BestSubsets::getConvergence( Scalar * const val ) const
451 {
452  *val = 0;
453  return 0;
454 }
455 
456 /****************************************************************/
457 /****************** getConfiguration ****************************/
459 {
460  if ( !_done ) {
461 
462  return;
463  }
464 
465  ConfigurationPtr model_config( new ConfigurationImpl("BestSubsets") );
466  config->addSubsection( model_config );
467 
468  model_config->addNameValue("Count", _numBestRuns);
469 
470  for ( int i=0; i < _numBestRuns; i++ ) {
471 
472  ConfigurationPtr child_config( new ConfigurationImpl("Run") );
473  child_config->addNameValue( "Id", i );
474  child_config->addNameValue( "OmissionError", _bestRun[i]->getOmission() * 100 );
475  child_config->addNameValue( "CommissionError", _bestRun[i]->getCommission() * 100 );
476 
477  ConfigurationPtr alg_config = _bestRun[i]->getAlgorithm()->getConfiguration();
478  child_config->addSubsection( alg_config );
479 
480  model_config->addSubsection( child_config );
481  }
482 }
483 
484 /***************************************************************/
485 /****************** setConfiguration ***************************/
487 {
488  ConstConfigurationPtr model_config = config->getSubsection( "BestSubsets", false );
489 
490  if ( ! model_config ) {
491 
492  return;
493  }
494 
495  _done = true;
496 
497  _numBestRuns = model_config->getAttributeAsInt( "Count", 0 );
498 
500 
501  Configuration::subsection_list runs = model_config->getAllSubsections();
502 
503  Configuration::subsection_list::const_iterator fin = runs.end();
504  Configuration::subsection_list::const_iterator it = runs.begin();
505  // The index i is used to populate the _bestRuns array it is incremented after each
506  // new algorithm is found.
507  int i;
508  for ( i = 0; it != fin; ++it ) {
509 
510  // Test this here rather than at the bottom of loop.
511  // This needs to be done after checking for loop terminal condition.
512  if ( i == _numBestRuns ) {
513 
514  throw ConfigurationException( "Number of deserialized algorithms exceeds Count" );
515  }
516 
517  ConstConfigurationPtr run_config = *it;
518 
519  if ( run_config->getName() != "Run" ) {
520 
521  continue;
522  }
523 
524  AlgorithmPtr alg = AlgorithmFactory::newAlgorithm( run_config->getSubsection("Algorithm") );
525 
526  _bestRun[i] = new AlgorithmRun( alg );
527 
528  // increment i after adding algorithmRun to _bestRun
529  ++i;
530  }
531 
532  if ( i < _numBestRuns ) {
533 
534  throw ConfigurationException( "Number of deserialized algorithms is smaller than Count" );
535  }
536 }
int _commissionSampleSize
AlgorithmPtr getAlgorithm()
Definition: AlgorithmRun.hh:73
void warn(const char *format,...)
'Warn' level.
Definition: Log.cpp:273
int done() const
std::vector< ConfigurationPtr > subsection_list
void sortRuns(AlgorithmRun **runList, int nelements, int errorType)
static AlgorithmPtr newAlgorithm(std::string const id)
float getProgress() const
double _commissionThreshold
double Scalar
Type of map values.
Definition: om_defs.hh:39
AlgorithmRun ** _bestRun
Scalar getValue(const Sample &x) const
virtual ~BestSubsets()
float getProgress() const
void _getConfiguration(ConfigurationPtr &) const
static Log * instance()
Returns the instance pointer, creating the object on the first call.
Definition: Log.cpp:45
#define SLEEP(secs)
virtual int transferParametersToAlgorithm()=0
void error(const char *format,...)
'Error' level.
Definition: Log.cpp:290
void printListOfRuns(string msg, AlgorithmRun **runs, int numOfRuns)
double getOmission() const
AlgorithmRun ** _activeRun
int _modelsUnderOmission
int getParameter(std::string const &name, std::string *value)
float _maxProgress
int _currentModelsUnderOmissionThreshold
int earlyTerminationConditionMet()
double getError(int type) const
void splitSampler(const SamplerPtr &orig, SamplerPtr *train, SamplerPtr *test, double propTrain)
Definition: Sampler.cpp:1171
bool _softOmissionThreshold
int calculateBestSubset()
double _trainProp
BestSubsets(AlgMetadata *metadata)
void _setConfiguration(const ConstConfigurationPtr &)
double getCommission() const
AlgorithmRun ** _finishedRun
int getId() const
Definition: AlgorithmRun.hh:63
SamplerPtr _samp
Definition: Algorithm.hh:245
void info(const char *format,...)
'Info' level.
Definition: Log.cpp:256
int numActiveThreads()
Scalar getValue(const Sample &x) const
double _omissionThreshold
AlgParameter * _alg_params
int _numFinishedRuns
int getConvergence(Scalar *const val) const
void debug(const char *format,...)
'Debug' level.
Definition: Log.cpp:237
AlgMetadata metadata
Definition: garp.cpp:134
bool running() const
Definition: Sample.hh:25
int initialize(int id, int comm_samples, const SamplerPtr &train_sampler, const SamplerPtr &test_sampler)