openModeller  Version 1.5.0
AbstractBestSubsets.cpp
Go to the documentation of this file.
1 
29 #ifdef WIN32
30 // avoid warnings caused by problems in VC headers
31 #define _SCL_SECURE_NO_DEPRECATE
32 #endif
33 
34 #include <string>
35 using std::string;
36 
37 #include "AbstractBestSubsets.hh"
39 
40 #include <math.h> // for function ceil()
41 
42 #ifdef WIN32
43 #include <windows.h>
44 #define SLEEP(secs) Sleep(secs * 1000)
45 #else
46 #include <unistd.h>
47 #define SLEEP(secs) sleep(secs);
48 #endif
49 
50 /****************************************************************/
51 static void printListOfRuns(string msg, AlgorithmRun ** runs, int numOfRuns)
52 {
53  printf("%s\n", msg.c_str());
54  for (int i = 0; i < numOfRuns; i++)
55  printf("%4d] om=%5.3f comm=%5.3f (id=%d)\n", i,
56  runs[i]->getOmission(),
57  runs[i]->getCommission(),
58  runs[i]->getId());
59 }
60 
61 /****************************************************************/
62 /****************** Garp class **********************************/
63 
65  AlgorithmImpl( metadata_bs )
66 {
67  _trainProp = 0.0;
68  _totalRuns = 0;
69  _omissionThreshold = 0.0;
73  _maxThreads = 0;
74 
75  _softOmissionThreshold = false;
77 
78  _finishedRun = NULL;
79  _activeRun = NULL;
80  _bestRun = NULL;
81 
82  _numFinishedRuns = 0;
83  _numActiveRuns = 0;
84  _done = false;
85 
86  _maxProgress = 0.0;
87 
88 }
89 
90 // ****************************************************************
92 {
93  int i;
94 
95  if (_finishedRun)
96  {
97  for (i = 0; i < _numFinishedRuns; i++)
98  { delete _finishedRun[i]; }
99  delete[] _finishedRun;
100  }
101 
102  if (_activeRun)
103  { delete[] _activeRun; }
104 
105  // bestRun just point to objects referenced by _finishedRun object
106  if (_bestRun)
107  { delete[] _bestRun; }
108 
109 }
110 
111 /****************************************************************/
112 /****************** configuration *******************************/
113 void
115 {
116 
117  if ( !_done )
118  return;
119 
120  ConfigurationPtr model_config( new ConfigurationImpl("BestSubsets") );
121  config->addSubsection( model_config );
122 
123  model_config->addNameValue("Count", _numBestRuns);
124 
125  for( int i=0; i<_numBestRuns; i++ ) {
126  ConfigurationPtr child_config( new ConfigurationImpl("Run" ) );
127  child_config->addNameValue( "Id", i );
128  child_config->addNameValue( "OmissionError", _bestRun[i]->getOmission() * 100 );
129  child_config->addNameValue( "CommissionError", _bestRun[i]->getCommission() * 100 );
130 
131  ConfigurationPtr alg_config = _bestRun[i]->getAlgorithm()->getConfiguration();
132  child_config->addSubsection( alg_config );
133 
134  model_config->addSubsection( child_config );
135 
136  }
137 
138 }
139 
140  void
142 {
143 
144  ConstConfigurationPtr model_config = config->getSubsection("BestSubsets",false);
145  if (!model_config)
146  return;
147 
148  _done = true;
149 
150  _numBestRuns = model_config->getAttributeAsInt( "Count", 0 );
151 
153 
154  Configuration::subsection_list runs = model_config->getAllSubsections();
155 
156  Configuration::subsection_list::const_iterator fin = runs.end();
157  Configuration::subsection_list::const_iterator it = runs.begin();
158  // The index i is used to populate the _bestRuns array it is incremented after each
159  // new algorithm is found.
160  int i;
161  for( i = 0; it != fin; ++it ) {
162 
163  // Test this here rather than at the bottom of loop.
164  // This needs to be done after checking for loop terminal condition.
165  if ( i == _numBestRuns ) {
166  throw ConfigurationException( "Number of deserialized algorithms exceeds Count" );
167  }
168 
169  ConstConfigurationPtr run_config = *it;
170 
171  if ( run_config->getName() != "Run" ) {
172  continue;
173  }
174 
175  AlgorithmPtr alg = AlgorithmFactory::newAlgorithm( run_config->getSubsection("Algorithm") );
176 
177  _bestRun[i] = new AlgorithmRun( alg );
178 
179  // increment i after adding algorithmRun to _bestRun
180  ++i;
181  }
182 
183  if ( i < _numBestRuns ) {
184  throw ConfigurationException( "Number of deserialized algorithms is smaller than Count" );
185  }
186 }
187 
188 // ****************************************************************
189 // ************* needNormalization ********************************
190 
192 {
193  // This is a hack. needNormalization is called before initialize.
194  AbstractBestSubsets *non_const = const_cast<AbstractBestSubsets*>(this);
195  if ( _subAlgorithm.empty() ) {
196  if ( !non_const->getParameter("SubAlgorithm", & _subAlgorithm ) ) {
197  std::string error = "Parameter SubAlgorithm not set properly.";
198  Log::instance()->error( error.c_str() );
199  throw AlgorithmException( error );
200  }
201  }
202 
204 
205  if ( alg->needNormalization() ) {
206 
207  Log::instance()->info( "Computing normalization in best subsets\n");
208 
209  // Compute normalization here to avoid computing it again in each GARP run
210  // note: getNormalizer will return a copy
211  _normalizerPtr = alg->getNormalizer();
212 
213  if ( _normalizerPtr ) {
214 
216 
218  }
219  }
220 
221  // No need to normalize again
222  return 0;
223 }
224 
225 // ****************************************************************
226 // ************* initialize ***************************************
227 
229 {
230  // BS parameters
231 #if 0
232  if (!getParameter("SubAlgorithm", &_subAlgorithm)) {
233  Log::instance()->error("Parameter SubAlgorithm not set properly.\n");
234  return 0;
235  }
236 #endif
237 
238  if (!getParameter("TrainingProportion", &_trainProp)) {
239  Log::instance()->error("Parameter TrainingProportion not set properly.\n");
240  return 0;
241  }
242 
243  if (!getParameter("TotalRuns", &_totalRuns)) {
244  Log::instance()->error("Parameter TotalRuns not set properly.\n");
245  return 0;
246  }
247 
248  if (!getParameter("HardOmissionThreshold", &_omissionThreshold)) {
249  Log::instance()->error("Parameter HardOmissionThreshold not set properly.\n");
250  return 0;
251  }
252 
253  if (!getParameter("ModelsUnderOmissionThreshold", &_modelsUnderOmission)) {
254  Log::instance()->error("Parameter ModelsUnderOmissionThreshold not set properly.\n");
255  return 0;
256  }
257 
258  if (!getParameter("CommissionThreshold", &_commissionThreshold)) {
259  Log::instance()->error("Parameter CommissionThreshold not set properly.\n");
260  return 0;
261  }
262 
263  if (!getParameter("CommissionSampleSize", &_commissionSampleSize)) {
264  Log::instance()->error("Parameter CommissionSampleSize not set properly.\n");
265  return 0;
266  }
267 
268  if (!getParameter("MaxThreads", &_maxThreads)) {
269  Log::instance()->error("Parameter MaxThreads not set properly.\n");
270  return 0;
271  }
272 
273  if ( _maxThreads < 1 )
274  {
275  _maxThreads = 1;
276  }
277  else if ( _maxThreads > 1 )
278  {
279  // When maxThreads is greater than 1, if the machine has only one processor om
280  // can crash. If the machine has more than one processor GDAL can output lots
281  // of IO errors (current GDAL version does not seem to be thread safe).
282  Log::instance()->warn("Multithreading is still experimental. When max threads is greater than 1, depending on software and hardware configuration this application may crash or you may see lots of raster IO warnings. In these cases, we recommend you to set this parameter to 1.\n");
283  }
284 
285  if (_trainProp <= 1.0)
286  {
287  Log::instance()->warn("The specified training proportion value is less than or equals 1. Please note that there was a change in the valid range for this parameter from 0-1 to 0-100. Small values may result in zero presence points being used to train the model.\n");
288  }
289 
290  // convert percentages (100%) to proportions (1.0) for external parameters
291  _trainProp /= 100.0;
292  _commissionThreshold /= 100.0;
293  _omissionThreshold /= 100.0;
294 
297  {
298  Log::instance()->warn("ModelsUnderOmission (%d) is greater than the number of runs (%d). ModelsUnderOmission will be reduced to (%d)\n", _modelsUnderOmission, _totalRuns, _totalRuns);
300  }
301 
304 
305  return 1;
306 }
307 
308 /****************************************************************/
309 /****************** iterate *************************************/
310 
312 {
313  static int iterations = 0;
314  static int runId = 0;
315  int active;
316  AlgorithmRun * algRun;
317 
318  ++iterations;
319 
320  if (_done)
321  { return 1; }
322 
323  // check if it should start new runs
326  {
327  // it needs to start more runs
328  // wait for a slot for a new thread
329  if ((active = numActiveThreads()) >= _maxThreads)
330  {
331  //Log::instance()->info("%5d] Waiting for a slot to run next thread (%d out of %d)\n", iterations, active, _maxThreads);
332  SLEEP(2);
333  }
334 
335  else
336  {
337  //Log::instance()->debug("%5d] There is an empty slot to run next thread (%d out of %d) - %d\n", iterations, active, _maxThreads, runId);
338 
339  // start new Algorithm
340  SamplerPtr train, test;
341  splitSampler(_samp, &train, &test, _trainProp);
342 
343  //printf("Presences: Orig=%d, train=%d, test=%d\n", _samp->numPresence(), train->numPresence(), test->numPresence());
344  //printf("Absences: Orig=%d, train=%d, test=%d\n", _samp->numAbsence(), train->numAbsence(), test->numAbsence());
345 
347  algo->setParameters( _param );
348  algo->setSampler(train);
349  algo->initialize();
350  algRun = new AlgorithmRun(algo);
351  algRun->initialize(runId++,
352  _commissionSampleSize, train, test );
353  _activeRun[_numActiveRuns++] = algRun;
354  algRun->run();
355  }
356  }
357 
358  else
359  {
360  // no more runs are needed
361  // check if all active threads have finished
362  active = numActiveThreads();
363  if (active)
364  {
365  // there are still threads running
366  // Log::instance()->info("%5d] Waiting for %d active thread(s) to finish.\n", iterations, active);
367  SLEEP(2);
368  }
369  else
370  {
371  // all running threads terminated
372  // calculate best subset and exit
373  //Log::instance()->info("%5d] Calculating best and terminating algorithm.\n", iterations);
375  _done = true;
376  }
377  }
378 
379  return 1;
380 }
381 
382 /****************************************************************/
384 {
385  int i;
386  AlgorithmRun * run;
387 
388  for (i = 0; i < _numActiveRuns; i++)
389  {
390  run = _activeRun[i];
391 
392  if (!run->running())
393  {
394  //Log::instance()->info("Thread %d has just finished.\n", run->getId());
395 
396  // run finished its work
397  // move it to finished runs
398  // and remove it from list of active runs
401  _activeRun[_numActiveRuns] = NULL;
402 
403  // update count of models under omission threshold
405  {
406  if (run->getOmission() <= _omissionThreshold)
408  }
409  }
410  }
411 
412  return _numActiveRuns;
413 }
414 
415 /****************************************************************/
417 {
418  return (!_softOmissionThreshold) &&
420 }
421 
422 /****************************************************************/
424 {
425  int i;
426 
427  Log::instance()->info("Calculating best subset of models.\n");
428 
429  // make a copy of finished runs to play with
430  AlgorithmRun ** runList = new AlgorithmRun*[_numFinishedRuns];
431  for (i = 0; i < _numFinishedRuns; i++)
432  { runList[i] = _finishedRun[i]; }
433 
434  printListOfRuns("Finished Runs:", runList, _numFinishedRuns);
435 
436  // get list of models that pass omission test
437  // sort runs by omission
438  // first <_modelsUnderOmission> runs are the selected ones
439  sortRuns(runList, _numFinishedRuns, 0);
440 
441  printListOfRuns("Finished Runs by Omission:", runList, _numFinishedRuns);
442 
443  // get list of models that pass commission test
444  sortRuns(runList, _modelsUnderOmission, 1);
445 
446  printListOfRuns("Best Omission Runs by Commission:", runList, _numFinishedRuns);
447 
448  _numBestRuns = (int)( _commissionThreshold * (double)_modelsUnderOmission + 0.5 );
449  int medianRun = _modelsUnderOmission / 2;
450  int firstRun = (int)ceil( (double) medianRun - (double)_numBestRuns / 2.0 );
451 
453 
454  for (i = 0; i < _numBestRuns; i++)
455  { _bestRun[i] = runList[i + firstRun]; }
456 
457  printListOfRuns("Best Runs:", _bestRun, _numBestRuns);
458 
459  printf("Median: %d First: %d\n", medianRun, firstRun);
460 
461 
462  delete[] runList;
463 
464  Log::instance()->info("Selected best %d models out of %d.\n", _numBestRuns, _totalRuns);
465 
466  return 1;
467 }
468 
469 /****************************************************************/
471  int nelements, int errorType)
472 {
473  int i, j;
474  AlgorithmRun * runJ0, * runJ1;
475 
476  //Log::instance()->info("Sorting list %x of %d elements by index %d.\n", runList, nelements, errorType);
477 
478  // bubble sort
479  // TODO: change to quicksort if this becomes a bottleneck
480  for (i = 0; i < nelements - 1; i++)
481  {
482  for (j = 0; j < nelements - i - 1; j++)
483  {
484  runJ0 = runList[j];
485  runJ1 = runList[j + 1];
486 
487  if (runJ0->getError(errorType) > runJ1->getError(errorType))
488  {
489  // exchange elements j and j + 1
490  runList[j] = runJ1;
491  runList[j + 1] = runJ0;
492  }
493  }
494  }
495 }
496 
497 /****************************************************************/
498 /****************** done ****************************************/
499 
501 {
502  return _done;
503 }
504 
505 /****************************************************************/
506 /****************** done ****************************************/
507 
509 {
510  if (done())
511  { return 1.0; }
512 
513  else
514  {
515  float progByTotalRuns = 0.0;
516  float progByHardOmission = 0.0;
517 
518  float avgProgressActiveRuns = 0.0;
519  for (int i = 0; i < _numActiveRuns; i++)
520  { avgProgressActiveRuns += _activeRun[i]->getProgress(); }
521  avgProgressActiveRuns /= _numActiveRuns;
522 
523  progByTotalRuns = (_numFinishedRuns + avgProgressActiveRuns) / (float) _totalRuns;
524 
526  {
527  progByHardOmission = (_currentModelsUnderOmissionThreshold /
528  (float) _modelsUnderOmission);
529  }
530 
531  float progress = (progByTotalRuns > progByHardOmission)? progByTotalRuns : progByHardOmission;
532 
533  if (progress > _maxProgress)
534  { _maxProgress = progress; }
535 
536  return _maxProgress;
537  }
538 }
539 
540 /****************************************************************/
541 /****************** getValue ************************************/
542 
544 {
545  int i;
546  double sum = 0.0;
547 
548  if (_done)
549  {
550  for (i = 0; i < _numBestRuns; i++)
551  { sum += _bestRun[i]->getValue(x); }
552  }
553 
554  return sum / (double) _numBestRuns;
555 }
556 
557 /****************************************************************/
558 /****************** getConvergence ******************************/
559 
561 {
562  *val = 0;
563  return 0;
564 }
565 
AlgorithmPtr getAlgorithm()
Definition: AlgorithmRun.hh:73
void warn(const char *format,...)
'Warn' level.
Definition: Log.cpp:273
std::vector< ConfigurationPtr > subsection_list
static AlgorithmPtr newAlgorithm(std::string const id)
double Scalar
Type of map values.
Definition: om_defs.hh:39
#define SLEEP(secs)
float getProgress() const
static Log * instance()
Returns the instance pointer, creating the object on the first call.
Definition: Log.cpp:45
AlgMetadata metadata_bs
void sortRuns(AlgorithmRun **runList, int nelements, int errorType)
void error(const char *format,...)
'Error' level.
Definition: Log.cpp:290
double getOmission() const
int getParameter(std::string const &name, std::string *value)
void setNormalization(const SamplerPtr &samp) const
Definition: Algorithm.cpp:350
double getError(int type) const
Scalar getValue(const Sample &x) const
void splitSampler(const SamplerPtr &orig, SamplerPtr *train, SamplerPtr *test, double propTrain)
Definition: Sampler.cpp:1171
AlgorithmRun ** _activeRun
static void printListOfRuns(string msg, AlgorithmRun **runs, int numOfRuns)
AbstractBestSubsets(AlgMetadata const *)
double getCommission() const
int getId() const
Definition: AlgorithmRun.hh:63
SamplerPtr _samp
Definition: Algorithm.hh:245
void info(const char *format,...)
'Info' level.
Definition: Log.cpp:256
Scalar getValue(const Sample &x) const
virtual void _setConfiguration(const ConstConfigurationPtr &)
virtual void _getConfiguration(ConfigurationPtr &) const
int getConvergence(Scalar *const val) const
virtual void computeNormalization(const ReferenceCountedPointer< const SamplerImpl > &samplerPtr)=0
ParamSetType _param
Definition: Algorithm.hh:249
Normalizer * _normalizerPtr
Definition: Algorithm.hh:247
bool running() const
Definition: Sample.hh:25
AlgorithmRun ** _finishedRun
static char error[256]
Definition: FileParser.cpp:42
int initialize(int id, int comm_samples, const SamplerPtr &train_sampler, const SamplerPtr &test_sampler)