openModeller  Version 1.4.0
AbstractBestSubsets.cpp
Go to the documentation of this file.
00001 
00029 #ifdef WIN32
00030 // avoid warnings caused by problems in VC headers
00031 #define _SCL_SECURE_NO_DEPRECATE
00032 #endif
00033 
#include <string>
using std::string;

#include "AbstractBestSubsets.hh"
#include <openmodeller/Exceptions.hh>

#include <algorithm> // for std::stable_sort
#include <math.h> // for function ceil()
00041 
00042 #ifdef WIN32
00043 #include <windows.h>
00044 #define SLEEP(secs) Sleep(secs * 1000)
00045 #else
00046 #include <unistd.h>
00047 #define SLEEP(secs) sleep(secs);
00048 #endif
00049 
00050 /****************************************************************/
00051 static void printListOfRuns(string msg, AlgorithmRun ** runs, int numOfRuns)
00052 {
00053   printf("%s\n", msg.c_str());
00054   for (int i = 0; i < numOfRuns; i++)
00055     printf("%4d] om=%5.3f comm=%5.3f (id=%d)\n", i, 
00056         runs[i]->getOmission(), 
00057         runs[i]->getCommission(),
00058         runs[i]->getId());
00059 }
00060 
00061 /****************************************************************/
00062 /****************** Garp class **********************************/
00063 
00064 AbstractBestSubsets::AbstractBestSubsets( AlgMetadata const * metadata_bs) :
00065   AlgorithmImpl( metadata_bs )
00066 {
00067   _trainProp = 0.0;
00068   _totalRuns = 0;
00069   _omissionThreshold = 0.0;
00070   _modelsUnderOmission = 0;
00071   _commissionThreshold = 0.0;
00072   _commissionSampleSize = 0;
00073   _maxThreads = 0;
00074 
00075   _softOmissionThreshold = false;
00076   _currentModelsUnderOmissionThreshold = 0;
00077 
00078   _finishedRun = NULL;
00079   _activeRun = NULL;
00080   _bestRun = NULL;
00081 
00082   _numFinishedRuns = 0;
00083   _numActiveRuns = 0;
00084   _done = false;
00085 
00086   _maxProgress = 0.0;
00087 
00088 }
00089 
00090 // ****************************************************************
00091 AbstractBestSubsets::~AbstractBestSubsets()
00092 {
00093   int i;
00094 
00095   if (_finishedRun)
00096   {
00097     for (i = 0; i < _numFinishedRuns; i++)
00098     { delete _finishedRun[i]; }
00099     delete[] _finishedRun;
00100   }
00101 
00102   if (_activeRun)
00103   { delete[] _activeRun; }
00104 
00105   // bestRun just point to objects referenced by _finishedRun object
00106   if (_bestRun)
00107   { delete[] _bestRun; }
00108 
00109 }
00110 
00111 /****************************************************************/
00112 /****************** configuration *******************************/
00113 void
00114 AbstractBestSubsets::_getConfiguration( ConfigurationPtr& config ) const
00115 {
00116 
00117   if ( !_done )
00118     return;
00119 
00120   ConfigurationPtr model_config( new ConfigurationImpl("BestSubsets") );
00121   config->addSubsection( model_config );
00122 
00123   model_config->addNameValue("Count", _numBestRuns);
00124 
00125   for( int i=0; i<_numBestRuns; i++ ) {
00126     ConfigurationPtr child_config( new ConfigurationImpl("Run" ) );
00127     child_config->addNameValue( "Id", i );
00128     child_config->addNameValue( "OmissionError", _bestRun[i]->getOmission() * 100 );
00129     child_config->addNameValue( "CommissionError", _bestRun[i]->getCommission() * 100 );
00130 
00131     ConfigurationPtr alg_config = _bestRun[i]->getAlgorithm()->getConfiguration();
00132     child_config->addSubsection( alg_config );
00133 
00134     model_config->addSubsection( child_config );
00135 
00136   }
00137 
00138 }
00139 
00140   void
00141 AbstractBestSubsets::_setConfiguration( const ConstConfigurationPtr& config )
00142 {
00143 
00144   ConstConfigurationPtr model_config = config->getSubsection("BestSubsets",false);
00145   if (!model_config)
00146     return;
00147 
00148   _done = true;
00149 
00150   _numBestRuns = model_config->getAttributeAsInt( "Count", 0 );
00151 
00152   _bestRun = new AlgorithmRun*[_numBestRuns];
00153 
00154   Configuration::subsection_list runs = model_config->getAllSubsections();
00155 
00156   Configuration::subsection_list::const_iterator fin = runs.end();
00157   Configuration::subsection_list::const_iterator it = runs.begin();
00158   // The index i is used to populate the _bestRuns array it is incremented after each
00159   // new algorithm is found.
00160   int i;
00161   for( i = 0; it != fin; ++it ) {
00162 
00163     // Test this here rather than at the bottom of loop.
00164     // This needs to be done after checking for loop terminal condition.
00165     if ( i == _numBestRuns ) {
00166       throw ConfigurationException( "Number of deserialized algorithms exceeds Count" );
00167     }
00168 
00169     ConstConfigurationPtr run_config = *it;
00170 
00171     if ( run_config->getName() != "Run" ) {
00172       continue;
00173     }
00174 
00175     AlgorithmPtr alg = AlgorithmFactory::newAlgorithm( run_config->getSubsection("Algorithm") );
00176 
00177     _bestRun[i] = new AlgorithmRun( alg );
00178 
00179     // increment i after adding algorithmRun to _bestRun
00180     ++i;
00181   }
00182 
00183   if ( i < _numBestRuns ) {
00184     throw ConfigurationException( "Number of deserialized algorithms is smaller than Count" );
00185   }
00186 }
00187 
00188 // ****************************************************************
00189 // ************* needNormalization ********************************
00190 
00191 int AbstractBestSubsets::needNormalization()
00192 {
00193   // This is a hack.  needNormalization is called before initialize.
00194   AbstractBestSubsets *non_const = const_cast<AbstractBestSubsets*>(this);
00195   if ( _subAlgorithm.empty() ) {
00196     if ( !non_const->getParameter("SubAlgorithm", & _subAlgorithm ) ) {
00197       std::string error = "Parameter SubAlgorithm not set properly.";
00198       Log::instance()->error( error.c_str() );
00199       throw AlgorithmException( error );
00200     }
00201   }
00202 
00203   AlgorithmPtr alg = AlgorithmFactory::newAlgorithm( _subAlgorithm );
00204 
00205   if ( alg->needNormalization() ) {
00206 
00207     Log::instance()->info( "Computing normalization in best subsets\n");
00208 
00209     // Compute normalization here to avoid computing it again in each GARP run
00210     // note: getNormalizer will return a copy
00211     _normalizerPtr = alg->getNormalizer();
00212 
00213     if ( _normalizerPtr ) {
00214 
00215       _normalizerPtr->computeNormalization( _samp );
00216 
00217       setNormalization( _samp );
00218     }
00219   }
00220  
00221   // No need to normalize again 
00222   return 0;
00223 }
00224 
00225 // ****************************************************************
00226 // ************* initialize ***************************************
00227 
/// Reads and validates all best-subsets parameters, converts the
/// percentage-valued parameters to proportions, and allocates the run
/// bookkeeping arrays (_finishedRun, _activeRun).
/// @return 1 on success, 0 when a required parameter is missing.
int AbstractBestSubsets::initialize()
{
  // BS parameters
  // SubAlgorithm is intentionally not read here — it is fetched lazily
  // in needNormalization(), which runs before initialize().
#if 0
  if (!getParameter("SubAlgorithm", &_subAlgorithm)) {
    Log::instance()->error("Parameter SubAlgorithm not set properly.\n");
    return 0;
  }
#endif

  // Percentage (0-100) of the sampler's points used to train each run.
  if (!getParameter("TrainingProportion", &_trainProp)) {
    Log::instance()->error("Parameter TrainingProportion not set properly.\n");
    return 0;
  }

  // Total number of sub-algorithm runs to execute.
  if (!getParameter("TotalRuns", &_totalRuns)) {
    Log::instance()->error("Parameter TotalRuns not set properly.\n");
    return 0;
  }

  // Omission error limit as a percentage; >= 100 disables the hard
  // threshold (see _softOmissionThreshold below).
  if (!getParameter("HardOmissionThreshold", &_omissionThreshold)) {
    Log::instance()->error("Parameter HardOmissionThreshold not set properly.\n");
    return 0;
  }

  // Number of models under the omission threshold needed for early stop
  // (see earlyTerminationConditionMet()).
  if (!getParameter("ModelsUnderOmissionThreshold", &_modelsUnderOmission)) {
    Log::instance()->error("Parameter ModelsUnderOmissionThreshold not set properly.\n");
    return 0;
  }

  // Percentage of the omission-selected models kept as the best subset
  // (used in calculateBestSubset()).
  if (!getParameter("CommissionThreshold", &_commissionThreshold)) {
    Log::instance()->error("Parameter CommissionThreshold not set properly.\n");
    return 0;
  }

  // Sample size forwarded to each AlgorithmRun::initialize() for
  // commission-error estimation (see iterate()).
  if (!getParameter("CommissionSampleSize", &_commissionSampleSize)) {
    Log::instance()->error("Parameter CommissionSampleSize not set properly.\n");
    return 0;
  }

  // Maximum number of runs executed concurrently.
  if (!getParameter("MaxThreads", &_maxThreads)) {
    Log::instance()->error("Parameter MaxThreads not set properly.\n");
    return 0;
  }

  // Clamp to at least one thread; warn about experimental multithreading.
  if ( _maxThreads < 1 )
  {
    _maxThreads = 1;
  }
  else if ( _maxThreads > 1 )
  {
    // When maxThreads is greater than 1, if the machine has only one processor om
    // can crash. If the machine has more than one processor GDAL can output lots
    // of IO errors (current GDAL version does not seem to be thread safe).
    Log::instance()->warn("Multithreading is still experimental. When max threads is greater than 1, depending on software and hardware configuration this application may crash or you may see lots of raster IO warnings. In these cases, we recommend you to set this parameter to 1.\n");
  }

  // Values <= 1 are probably old-style proportions (valid range changed
  // from 0-1 to 0-100); warn but keep the value.
  if (_trainProp <= 1.0)
  {
    Log::instance()->warn("The specified training proportion value is less than or equals 1. Please note that there was a change in the valid range for this parameter from 0-1 to 0-100. Small values may result in zero presence points being used to train the model.\n");
  }

  // convert percentages (100%) to proportions (1.0) for external parameters 
  _trainProp /= 100.0;
  _commissionThreshold /= 100.0;
  _omissionThreshold /= 100.0;

  // A (converted) omission threshold of 1.0 or more can never reject a
  // model, so treat it as a "soft" (disabled) threshold.
  _softOmissionThreshold = (_omissionThreshold >= 1.0);
  if (_modelsUnderOmission > _totalRuns)
  {
    Log::instance()->warn("ModelsUnderOmission (%d) is greater than the number of runs (%d). ModelsUnderOmission will be reduced to (%d)\n", _modelsUnderOmission, _totalRuns, _totalRuns);
    _modelsUnderOmission = _totalRuns;
  }

  // Worst case every run finishes, and at most _maxThreads run at once.
  _finishedRun = new AlgorithmRun*[_totalRuns];
  _activeRun = new AlgorithmRun*[_maxThreads];

  return 1;
}
00307 
00308 /****************************************************************/
00309 /****************** iterate *************************************/
00310 
/// Performs one scheduling step: starts a new sub-algorithm run when a
/// thread slot is free, waits while slots are busy, and — once all runs
/// are finished or early termination is met — computes the best subset
/// and marks the model done.
/// @return always 1.
int AbstractBestSubsets::iterate()
{
  // NOTE(review): these function-level statics are shared by ALL
  // instances of this class in the process and are never reset, so a
  // second model computed in the same process continues the counters
  // (run ids keep growing). They look like they should be member
  // variables — confirm before changing.
  static int iterations = 0;
  static int runId = 0;
  int active;
  AlgorithmRun * algRun;

  ++iterations;

  // Nothing left to do once the best subset has been computed.
  if (_done)
  { return 1; }

  // check if it should start new runs
  if ((_numFinishedRuns + _numActiveRuns < _totalRuns) && 
      !earlyTerminationConditionMet())
  {
    // it needs to start more runs
    // wait for a slot for a new thread
    if ((active = numActiveThreads()) >= _maxThreads)
    {
      //Log::instance()->info("%5d] Waiting for a slot to run next thread (%d out of %d)\n", iterations, active, _maxThreads);
      SLEEP(2); 
    }

    else
    {
      //Log::instance()->debug("%5d] There is an empty slot to run next thread (%d out of %d) - %d\n", iterations, active, _maxThreads, runId);

      // start new Algorithm on a fresh train/test split of the sampler
      SamplerPtr train, test;
      splitSampler(_samp, &train, &test, _trainProp);

      //printf("Presences: Orig=%d, train=%d, test=%d\n", _samp->numPresence(), train->numPresence(), test->numPresence());
      //printf("Absences:  Orig=%d, train=%d, test=%d\n", _samp->numAbsence(), train->numAbsence(), test->numAbsence());

      // The sub-algorithm trains on `train`; AlgorithmRun evaluates
      // omission on `test` and commission on _commissionSampleSize points.
      AlgorithmPtr algo = AlgorithmFactory::newAlgorithm( _subAlgorithm );
      algo->setParameters( _param );
      algo->setSampler(train);
      algo->initialize();
      algRun = new AlgorithmRun(algo);
      algRun->initialize(runId++,
          _commissionSampleSize, train, test ); 
      _activeRun[_numActiveRuns++] = algRun;
      algRun->run();
    }
  }

  else
  {
    // no more runs are needed
    // check if all active threads have finished
    active = numActiveThreads();
    if (active)
    {
      // there are still threads running
      //  Log::instance()->info("%5d] Waiting for %d active thread(s) to finish.\n", iterations, active);
      SLEEP(2); 
    }
    else
    {
      // all running threads terminated
      // calculate best subset and exit
      //Log::instance()->info("%5d] Calculating best and terminating algorithm.\n", iterations);
      calculateBestSubset();
      _done = true;
    }
  }

  return 1;
}
00381 
00382 /****************************************************************/
00383 int AbstractBestSubsets::numActiveThreads()
00384 {
00385   int i;
00386   AlgorithmRun * run;
00387 
00388   for (i = 0; i < _numActiveRuns; i++)
00389   {
00390     run = _activeRun[i];
00391 
00392     if (!run->running())
00393     {
00394       //Log::instance()->info("Thread %d has just finished.\n", run->getId());
00395 
00396       // run finished its work
00397       // move it to finished runs
00398       // and remove it from list of active runs
00399       _finishedRun[_numFinishedRuns++] = run;
00400       _activeRun[i] = _activeRun[--_numActiveRuns];
00401       _activeRun[_numActiveRuns] = NULL;
00402 
00403       // update count of models under omission threshold
00404       if (!_softOmissionThreshold)
00405       {
00406         if (run->getOmission() <= _omissionThreshold)
00407         { _currentModelsUnderOmissionThreshold++; }
00408       }
00409     }
00410   }
00411 
00412   return _numActiveRuns;
00413 }
00414 
00415 /****************************************************************/
00416 int AbstractBestSubsets::earlyTerminationConditionMet()
00417 {
00418   return (!_softOmissionThreshold) &&
00419     (_currentModelsUnderOmissionThreshold >= _modelsUnderOmission);
00420 }
00421 
00422 /****************************************************************/
00423 int AbstractBestSubsets::calculateBestSubset()
00424 {
00425   int i;
00426 
00427   Log::instance()->info("Calculating best subset of models.\n");
00428 
00429   // make a copy of finished runs to play with
00430   AlgorithmRun ** runList = new AlgorithmRun*[_numFinishedRuns];
00431   for (i = 0; i < _numFinishedRuns; i++)
00432   { runList[i] = _finishedRun[i]; }
00433 
00434   printListOfRuns("Finished Runs:", runList, _numFinishedRuns);
00435 
00436   // get list of models that pass omission test
00437   // sort runs by omission
00438   // first <_modelsUnderOmission> runs are the selected ones
00439   sortRuns(runList, _numFinishedRuns, 0);
00440 
00441   printListOfRuns("Finished Runs by Omission:", runList, _numFinishedRuns);
00442 
00443   // get list of models that pass commission test
00444   sortRuns(runList, _modelsUnderOmission, 1);
00445 
00446   printListOfRuns("Best Omission Runs by Commission:", runList, _numFinishedRuns);
00447 
00448   _numBestRuns = (int)( _commissionThreshold * (double)_modelsUnderOmission + 0.5 );
00449   int medianRun = _modelsUnderOmission / 2;
00450   int firstRun = (int)ceil( (double) medianRun - (double)_numBestRuns / 2.0 );
00451 
00452   _bestRun = new AlgorithmRun*[_numBestRuns];
00453 
00454   for (i = 0; i < _numBestRuns; i++)
00455   { _bestRun[i] = runList[i + firstRun]; }
00456 
00457   printListOfRuns("Best Runs:", _bestRun, _numBestRuns);
00458 
00459   printf("Median: %d First: %d\n", medianRun, firstRun);
00460 
00461 
00462   delete[] runList;
00463 
00464   Log::instance()->info("Selected best %d models out of %d.\n", _numBestRuns, _totalRuns);
00465 
00466   return 1;
00467 }
00468 
00469 /****************************************************************/
00470 void AbstractBestSubsets::sortRuns(AlgorithmRun ** runList, 
00471     int nelements, int errorType)
00472 {
00473   int i, j;
00474   AlgorithmRun * runJ0, * runJ1;
00475 
00476   //Log::instance()->info("Sorting list %x of %d elements by index %d.\n", runList, nelements, errorType);
00477 
00478   // bubble sort
00479   // TODO: change to quicksort if this becomes a bottleneck
00480   for (i = 0; i < nelements - 1; i++)
00481   { 
00482     for (j = 0; j < nelements - i - 1; j++)
00483     {
00484       runJ0 = runList[j];
00485       runJ1 = runList[j + 1];
00486 
00487       if (runJ0->getError(errorType) > runJ1->getError(errorType))
00488       {
00489         // exchange elements j and j + 1
00490         runList[j] = runJ1;
00491         runList[j + 1] = runJ0;
00492       }
00493     }
00494   }
00495 }
00496 
00497 /****************************************************************/
00498 /****************** done ****************************************/
00499 
00500 int AbstractBestSubsets::done() const
00501 {
00502   return _done;
00503 }
00504 
00505 /****************************************************************/
00506 /****************** done ****************************************/
00507 
00508 float AbstractBestSubsets::getProgress() const
00509 {
00510   if (done())
00511   { return 1.0; } 
00512 
00513   else
00514   {
00515     float progByTotalRuns = 0.0;
00516     float progByHardOmission = 0.0;
00517 
00518     float avgProgressActiveRuns = 0.0;
00519     for (int i = 0; i < _numActiveRuns; i++)
00520     { avgProgressActiveRuns += _activeRun[i]->getProgress(); }
00521     avgProgressActiveRuns /= _numActiveRuns;
00522 
00523     progByTotalRuns = (_numFinishedRuns + avgProgressActiveRuns) / (float) _totalRuns;
00524 
00525     if (!_softOmissionThreshold)
00526     {
00527       progByHardOmission = (_currentModelsUnderOmissionThreshold / 
00528           (float) _modelsUnderOmission); 
00529     }
00530 
00531     float progress = (progByTotalRuns > progByHardOmission)? progByTotalRuns : progByHardOmission;
00532 
00533     if (progress > _maxProgress)
00534     { _maxProgress = progress; }
00535 
00536     return _maxProgress;
00537   }
00538 }
00539 
00540 /****************************************************************/
00541 /****************** getValue ************************************/
00542 
00543 Scalar AbstractBestSubsets::getValue( const Sample& x ) const
00544 {
00545   int i;
00546   double sum = 0.0;
00547 
00548   if (_done)
00549   {
00550     for (i = 0; i < _numBestRuns; i++)
00551     { sum += _bestRun[i]->getValue(x); }
00552   }
00553 
00554   return sum / (double) _numBestRuns;
00555 }
00556 
00557 /****************************************************************/
00558 /****************** getConvergence ******************************/
00559 
/// Convergence tracking is not supported by this meta-algorithm.
/// @param val output parameter; always set to 0.
/// @return 0, meaning no convergence information is available.
int AbstractBestSubsets::getConvergence( Scalar * const val ) const
{
  *val = 0;
  return 0;
}
00565