openModeller  Version 1.4.0
best_subsets.cpp
Go to the documentation of this file.
00001 
00030 #include "best_subsets.hh"
00031 
00032 #include <openmodeller/om.hh>
00033 #include <openmodeller/Random.hh>
00034 #include <openmodeller/Sampler.hh>
00035 #include <openmodeller/Exceptions.hh>
00036 
00037 #include <stdlib.h>
00038 #include <stdio.h>
00039 #include <math.h>
00040 #include <string>
00041 using std::string;
00042 
00043 #ifdef WIN32
00044 #include <windows.h>
00045 #define SLEEP(secs) Sleep(secs * 1000)
00046 #else
00047 #include <unistd.h>
00048 #define SLEEP(secs) sleep(secs);
00049 #endif
00050 
00051 
00052 /****************************************************************/
00053 void BestSubsets::printListOfRuns(string msg, AlgorithmRun ** runs, int numOfRuns)
00054 {
00055   printf("%s\n", msg.c_str());
00056   for (int i = 0; i < numOfRuns; i++)
00057     printf("%4d] om=%5.3f comm=%5.3f (id=%d)\n", i, 
00058         runs[i]->getOmission(), 
00059         runs[i]->getCommission(),
00060         runs[i]->getId());
00061 }
00062 
00063 /****************************************************************/
00064 /****************** Garp class **********************************/
00065 
00066   BestSubsets::BestSubsets(AlgMetadata * metadata)
00067 : AlgorithmImpl(metadata)
00068 {
00069   _trainProp = 0.0;
00070   _totalRuns = 0;
00071   _omissionThreshold = 0.0;
00072   _modelsUnderOmission = 0;
00073   _commissionThreshold = 0.0;
00074   _commissionSampleSize = 0;   
00075 
00076   _softOmissionThreshold = false;
00077   _currentModelsUnderOmissionThreshold = 0;
00078 
00079   _finishedRun = NULL;
00080   _activeRun = NULL;
00081   _bestRun = NULL;
00082 
00083   _nparam = 0;
00084   _alg_params = NULL;
00085 
00086   _numFinishedRuns = 0;
00087   _numActiveRuns = 0;
00088   _done = false;
00089 
00090   _maxProgress = 0.0;
00091 
00092 }
00093 
00094 // ****************************************************************
00095 BestSubsets::~BestSubsets()
00096 {
00097   int i;
00098 
00099   if (_finishedRun)
00100   {
00101     for (i = 0; i < _numFinishedRuns; i++)
00102     { delete _finishedRun[i]; }
00103     delete[] _finishedRun;
00104   }
00105 
00106   if (_activeRun)
00107   { delete[] _activeRun; }
00108 
00109   // bestRun just point to objects referenced by _finishedRun object
00110   if (_bestRun)
00111   { delete[] _bestRun; }
00112 
00113   if (_alg_params)
00114     delete[] _alg_params;
00115 }
00116 
00117 // ****************************************************************
00118 // ************* initialize ***************************************
00119 
00120 int BestSubsets::initialize()
00121 {
00122   // BS parameters
00123   if (!getParameter("TrainingProportion", &_trainProp)) {
00124     Log::instance()->error("Parameter TrainingProportion not set properly.\n");
00125     return 0;
00126   }
00127 
00128   if (!getParameter("TotalRuns", &_totalRuns)) {
00129     Log::instance()->error("Parameter TotalRuns not set properly.\n");
00130     return 0;
00131   }
00132 
00133   if (!getParameter("HardOmissionThreshold", &_omissionThreshold)) {
00134     Log::instance()->error("Parameter HardOmissionThreshold not set properly.\n");
00135     return 0;
00136   }
00137 
00138   if (!getParameter("ModelsUnderOmissionThreshold", &_modelsUnderOmission)) {
00139     Log::instance()->error("Parameter ModelsUnderOmissionThreshold not set properly.\n");
00140     return 0;
00141   }
00142 
00143   if (!getParameter("CommissionThreshold", &_commissionThreshold)) {
00144     Log::instance()->error("Parameter CommissionThreshold not set properly.\n");
00145     return 0;
00146   }
00147 
00148   if (!getParameter("CommissionSampleSize", &_commissionSampleSize)) {
00149     Log::instance()->error("Parameter CommissionSampleSize not set properly.\n");
00150     return 0;
00151   }
00152 
00153   if (!getParameter("MaxThreads", &_maxThreads)) {
00154     Log::instance()->error("Parameter MaxThreads not set properly.\n");
00155     return 0;
00156   }
00157 
00158   if ( _maxThreads < 1 )
00159   {
00160     _maxThreads = 1;
00161   }
00162   else if ( _maxThreads > 1 )
00163   {
00164     // When maxThreads is greater than 1, if the machine has only one processor om
00165     // can crash. If the machine has more than one processor GDAL can output lots
00166     // of IO errors (current GDAL version does not seem to be thread safe).
00167     Log::instance()->warn("Multithreading is still experimental. When max threads is greater than 1, depending on software and hardware configuration this application may crash or you may see lots of raster IO warnings. In these cases, we recommend you to set this parameter to 1.\n");
00168   }
00169 
00170   if (_trainProp <= 1.0)
00171   {
00172     Log::instance()->warn("The specified training proportion value is less than or equals 1. Please note that there was a change in the valid range for this parameter from 0-1 to 0-100. Small values may result in zero presence points being used to train the model.\n");
00173   }
00174 
00175   // convert percentages (100%) to proportions (1.0) for external parameters 
00176   _trainProp /= 100.0;
00177   _commissionThreshold /= 100.0;
00178   _omissionThreshold /= 100.0;
00179 
00180   _softOmissionThreshold = (_omissionThreshold >= 1.0);
00181   if (_modelsUnderOmission > _totalRuns)
00182   {
00183     Log::instance()->warn("ModelsUnderOmission (%d) is greater than the number of runs (%d). ModelsUnderOmission will be reduced to (%d)\n", _modelsUnderOmission, _totalRuns, _totalRuns);
00184     _modelsUnderOmission = _totalRuns;
00185   }
00186 
00187   _finishedRun = new AlgorithmRun*[_totalRuns];
00188   _activeRun = new AlgorithmRun*[_maxThreads];
00189 
00190   transferParametersToAlgorithm();
00191 
00192   return 1;
00193 }
00194 
00195 /****************************************************************/
00196 /****************** iterate *************************************/
00197 
00198 int BestSubsets::iterate()
00199 {
00200   static int iterations = 0;
00201   int active;
00202   AlgorithmRun * algRun;
00203 
00204   ++iterations;
00205 
00206   if (_done)
00207   { return 1; }
00208 
00209   // check if it should start new runs
00210   if ((_numFinishedRuns + _numActiveRuns < _totalRuns) && 
00211       !earlyTerminationConditionMet())
00212   {
00213     // it needs to start more runs
00214     // wait for a slot for a new thread
00215     if ((active = numActiveThreads()) >= _maxThreads)
00216     {
00217       //Log::instance()->info("%5d] Waiting for a slot to run next thread (%d out of %d)\n", iterations, active, _maxThreads);
00218       SLEEP(2); 
00219     }
00220 
00221     else
00222     {
00223       int runId = _numFinishedRuns + _numActiveRuns;
00224 
00225       //Log::instance()->info("%5d] There is an empty slot to run next thread (%d out of %d) - %d\n", iterations, active, _maxThreads, runId);
00226 
00227       // start new Algorithm
00228       SamplerPtr train, test;
00229       splitSampler( _samp, &train, &test, _trainProp );
00230 
00231       Log::instance()->debug( "Presences: orig=%d, train=%d, test=%d\n", _samp->numPresence(), train->numPresence(), test->numPresence() );
00232       Log::instance()->debug( "Absences:  orig=%d, train=%d, test=%d\n", _samp->numAbsence(), train->numAbsence(), test->numAbsence() );
00233 
00234       algRun = new AlgorithmRun();
00235       algRun->initialize(runId,
00236           _commissionSampleSize, train, test, 
00237           _nparam, _alg_params, 
00238           (BSAlgorithmFactory *) this);
00239       _activeRun[_numActiveRuns++] = algRun;
00240       algRun->run();
00241     }
00242   }
00243 
00244   else
00245   {
00246     // no more runs are needed
00247     // check if all active threads have finished
00248     active = numActiveThreads();
00249       if (active)
00250       {
00251         // there are still threads running
00252         /*
00253            Log::instance()->info("%5d] Waiting for %d active thread(s) to finish.\n", 
00254            iterations, active);
00255            */
00256         SLEEP(2); 
00257       }
00258 
00259       else
00260       {
00261         // all running threads terminated
00262         // calculate best subset and exit
00263         //Log::instance()->info("%5d] Calculating best and terminating algorithm.\n", i);
00264         calculateBestSubset();
00265         _done = true;
00266       }
00267   }
00268 
00269   return 1;
00270 }
00271 
00272 /****************************************************************/
00273 int BestSubsets::numActiveThreads()
00274 {
00275   int i;
00276   AlgorithmRun * run;
00277 
00278   for (i = 0; i < _numActiveRuns; i++)
00279   {
00280     run = _activeRun[i];
00281 
00282     if (!run->running())
00283     {
00284       //Log::instance()->info("Thread %d has just finished.\n", run->getId());
00285 
00286       // run finished its work
00287       // move it to finished runs
00288       // and remove it from list of active runs
00289       _finishedRun[_numFinishedRuns++] = run;
00290       _activeRun[i] = _activeRun[--_numActiveRuns];
00291       _activeRun[_numActiveRuns] = NULL;
00292 
00293       // update count of models under omission threshold
00294       if (!_softOmissionThreshold)
00295       {
00296         if (run->getOmission() <= _omissionThreshold)
00297         { _currentModelsUnderOmissionThreshold++; }
00298       }
00299     }
00300   }
00301 
00302   return _numActiveRuns;
00303 }
00304 
00305 /****************************************************************/
00306 int BestSubsets::earlyTerminationConditionMet()
00307 {
00308   return (!_softOmissionThreshold) &&
00309     (_currentModelsUnderOmissionThreshold >= _modelsUnderOmission);
00310 }
00311 
00312 /****************************************************************/
00313 int BestSubsets::calculateBestSubset()
00314 {
00315   int i;
00316 
00317   Log::instance()->info("Calculating best subset of models.\n");
00318 
00319   // make a copy of finished runs to play with
00320   AlgorithmRun ** runList = new AlgorithmRun*[_numFinishedRuns];
00321   for (i = 0; i < _numFinishedRuns; i++)
00322   { runList[i] = _finishedRun[i]; }
00323 
00324   printListOfRuns("Finished Runs:", runList, _numFinishedRuns);
00325 
00326   // get list of models that pass omission test
00327   // sort runs by omission
00328   // first <_modelsUnderOmission> runs are the selected ones
00329   sortRuns(runList, _numFinishedRuns, 0);
00330 
00331   printListOfRuns("Finished Runs by Omission:", runList, _numFinishedRuns);
00332 
00333   // get list of models that pass commission test
00334   sortRuns(runList, _modelsUnderOmission, 1);
00335 
00336   printListOfRuns("Best Omission Runs by Commission:", runList, _numFinishedRuns);
00337 
00338   _numBestRuns = (int)( _commissionThreshold * (double)_modelsUnderOmission + 0.5 );
00339   int medianRun = _modelsUnderOmission / 2;
00340   int firstRun = (int)ceil( (double)medianRun - (double)_numBestRuns / 2.0 );
00341 
00342   _bestRun = new AlgorithmRun*[_numBestRuns];
00343 
00344   for (i = 0; i < _numBestRuns; i++)
00345   { _bestRun[i] = runList[i + firstRun]; }
00346 
00347   printListOfRuns("Best Runs:", _bestRun, _numBestRuns);
00348 
00349   printf("Median: %d First: %d\n", medianRun, firstRun);
00350 
00351 
00352   delete[] runList;
00353 
00354   Log::instance()->info("Selected best %d models out of %d.\n", _numBestRuns, _totalRuns);
00355 
00356   return 1;
00357 }
00358 
00359 /****************************************************************/
00360 void BestSubsets::sortRuns(AlgorithmRun ** runList, 
00361     int nelements, int errorType)
00362 {
00363   int i, j;
00364   AlgorithmRun * runJ0, * runJ1;
00365 
00366   Log::instance()->info("Sorting list %d of %d elements by index %d.\n", runList, nelements, errorType);
00367 
00368   // bubble sort
00369   // TODO: change to quicksort if this becomes a bottleneck
00370   for (i = 0; i < nelements - 1; i++)
00371   { 
00372     for (j = 0; j < nelements - i - 1; j++)
00373     {
00374       runJ0 = runList[j];
00375       runJ1 = runList[j + 1];
00376 
00377       if (runJ0->getError(errorType) > runJ1->getError(errorType))
00378       {
00379         // exchange elements j and j + 1
00380         runList[j] = runJ1;
00381         runList[j + 1] = runJ0;
00382       }
00383     }
00384   }
00385 }
00386 
00387 /****************************************************************/
00388 /****************** done ****************************************/
00389 
00390 int BestSubsets::done() const
00391 {
00392   return _done;
00393 }
00394 
00395 /****************************************************************/
00396 /****************** done ****************************************/
00397 
00398 float BestSubsets::getProgress() const
00399 {
00400   if (done())
00401   { return 1.0; } 
00402 
00403   else
00404   {
00405     float progByTotalRuns = 0.0;
00406     float progByHardOmission = 0.0;
00407 
00408     float avgProgressActiveRuns = 0.0;
00409     for (int i = 0; i < _numActiveRuns; i++)
00410     { avgProgressActiveRuns += _activeRun[i]->getProgress(); }
00411     avgProgressActiveRuns /= _numActiveRuns;
00412 
00413     progByTotalRuns = (_numFinishedRuns + avgProgressActiveRuns) / (float) _totalRuns;
00414 
00415     if (!_softOmissionThreshold)
00416     {
00417       progByHardOmission = (_currentModelsUnderOmissionThreshold / 
00418           (float) _modelsUnderOmission); 
00419     }
00420 
00421     float progress = (progByTotalRuns > progByHardOmission)? progByTotalRuns : progByHardOmission;
00422 
00423     if (progress > _maxProgress)
00424     { _maxProgress = progress; }
00425 
00426     return _maxProgress;
00427   }
00428 }
00429 
00430 /****************************************************************/
00431 /****************** getValue ************************************/
00432 
00433 Scalar BestSubsets::getValue( const Sample& x ) const
00434 {
00435   int i;
00436   double sum = 0.0;
00437 
00438   if (_done)
00439   {
00440     for (i = 0; i < _numBestRuns; i++)
00441     { sum += _bestRun[i]->getValue(x); }
00442   }
00443 
00444   return sum / (double) _numBestRuns;
00445 }
00446 
00447 /****************************************************************/
00448 /****************** getConvergence ******************************/
00449 
00450 int BestSubsets::getConvergence( Scalar * const val ) const
00451 {
00452   *val = 0;
00453   return 0;
00454 }
00455 
00456 /****************************************************************/
00457 /****************** getConfiguration ****************************/
00458 void BestSubsets::_getConfiguration( ConfigurationPtr& config ) const
00459 {
00460   if ( !_done ) {
00461 
00462     return;
00463   }
00464 
00465   ConfigurationPtr model_config( new ConfigurationImpl("BestSubsets") );
00466   config->addSubsection( model_config );
00467 
00468   model_config->addNameValue("Count", _numBestRuns);
00469 
00470   for ( int i=0; i < _numBestRuns; i++ ) {
00471 
00472     ConfigurationPtr child_config( new ConfigurationImpl("Run") );
00473     child_config->addNameValue( "Id", i );
00474     child_config->addNameValue( "OmissionError", _bestRun[i]->getOmission() * 100 );
00475     child_config->addNameValue( "CommissionError", _bestRun[i]->getCommission() * 100 );
00476 
00477     ConfigurationPtr alg_config = _bestRun[i]->getAlgorithm()->getConfiguration();
00478     child_config->addSubsection( alg_config );
00479 
00480     model_config->addSubsection( child_config );
00481   }
00482 }
00483 
00484 /***************************************************************/
00485 /****************** setConfiguration ***************************/
00486 void BestSubsets::_setConfiguration( const ConstConfigurationPtr& config )
00487 {
00488   ConstConfigurationPtr model_config = config->getSubsection( "BestSubsets", false );
00489 
00490   if ( ! model_config ) {
00491 
00492     return;
00493   }
00494 
00495   _done = true;
00496 
00497   _numBestRuns = model_config->getAttributeAsInt( "Count", 0 );
00498 
00499   _bestRun = new AlgorithmRun*[_numBestRuns];
00500 
00501   Configuration::subsection_list runs = model_config->getAllSubsections();
00502 
00503   Configuration::subsection_list::const_iterator fin = runs.end();
00504   Configuration::subsection_list::const_iterator it = runs.begin();
00505   // The index i is used to populate the _bestRuns array it is incremented after each
00506   // new algorithm is found.
00507   int i;
00508   for ( i = 0; it != fin; ++it ) {
00509 
00510     // Test this here rather than at the bottom of loop.
00511     // This needs to be done after checking for loop terminal condition.
00512     if ( i == _numBestRuns ) {
00513 
00514       throw ConfigurationException( "Number of deserialized algorithms exceeds Count" );
00515     }
00516 
00517     ConstConfigurationPtr run_config = *it;
00518 
00519     if ( run_config->getName() != "Run" ) {
00520 
00521       continue;
00522     }
00523 
00524     AlgorithmPtr alg = AlgorithmFactory::newAlgorithm( run_config->getSubsection("Algorithm") );
00525 
00526     _bestRun[i] = new AlgorithmRun( alg );
00527 
00528     // increment i after adding algorithmRun to _bestRun
00529     ++i;
00530   }
00531 
00532   if ( i < _numBestRuns ) {
00533 
00534     throw ConfigurationException( "Number of deserialized algorithms is smaller than Count" );
00535   }
00536 }