doxygen/1.4.0/best__subsets_8cpp_source.html

00001
00030 #include "best_subsets.hh"
00031
00032 #include <openmodeller/om.hh>
00033 #include <openmodeller/Random.hh>
00034 #include <openmodeller/Sampler.hh>
00035 #include <openmodeller/Exceptions.hh>
00036
00037 #include <stdlib.h>
00038 #include <stdio.h>
00039 #include <math.h>
00040 #include <string>
00041 using std::string;
00042
00043 #ifdef WIN32
00044 #include <windows.h>
00045 #define SLEEP(secs) Sleep(secs * 1000)
00046 #else
00047 #include <unistd.h>
00048 #define SLEEP(secs) sleep(secs);
00049 #endif
00050
00051
00052 /****************************************************************/
00053 void BestSubsets::printListOfRuns(string msg, AlgorithmRun ** runs, int numOfRuns)
00054 {
00055   printf("%s\n", msg.c_str());
00056   for (int i = 0; i < numOfRuns; i++)
00057     printf("%4d] om=%5.3f comm=%5.3f (id=%d)\n", i,
00058         runs[i]->getOmission(),
00059         runs[i]->getCommission(),
00060         runs[i]->getId());
00061 }
00062
00063 /****************************************************************/
00064 /****************** Garp class **********************************/
00065
00066   BestSubsets::BestSubsets(AlgMetadata * metadata)
00067 : AlgorithmImpl(metadata)
00068 {
00069   _trainProp = 0.0;
00070   _totalRuns = 0;
00071   _omissionThreshold = 0.0;
00072   _modelsUnderOmission = 0;
00073   _commissionThreshold = 0.0;
00074   _commissionSampleSize = 0;
00075
00076   _softOmissionThreshold = false;
00077   _currentModelsUnderOmissionThreshold = 0;
00078
00079   _finishedRun = NULL;
00080   _activeRun = NULL;
00081   _bestRun = NULL;
00082
00083   _nparam = 0;
00084   _alg_params = NULL;
00085
00086   _numFinishedRuns = 0;
00087   _numActiveRuns = 0;
00088   _done = false;
00089
00090   _maxProgress = 0.0;
00091
00092 }
00093
00094 // ****************************************************************
00095 BestSubsets::~BestSubsets()
00096 {
00097   int i;
00098
00099   if (_finishedRun)
00100   {
00101     for (i = 0; i < _numFinishedRuns; i++)
00102     { delete _finishedRun[i]; }
00103     delete[] _finishedRun;
00104   }
00105
00106   if (_activeRun)
00107   { delete[] _activeRun; }
00108
00109   // bestRun just point to objects referenced by _finishedRun object
00110   if (_bestRun)
00111   { delete[] _bestRun; }
00112
00113   if (_alg_params)
00114     delete[] _alg_params;
00115 }
00116
00117 // ****************************************************************
00118 // ************* initialize ***************************************
00119
00120 int BestSubsets::initialize()
00121 {
00122   // BS parameters
00123   if (!getParameter("TrainingProportion", &_trainProp)) {
00124     Log::instance()->error("Parameter TrainingProportion not set properly.\n");
00125     return 0;
00126   }
00127
00128   if (!getParameter("TotalRuns", &_totalRuns)) {
00129     Log::instance()->error("Parameter TotalRuns not set properly.\n");
00130     return 0;
00131   }
00132
00133   if (!getParameter("HardOmissionThreshold", &_omissionThreshold)) {
00134     Log::instance()->error("Parameter HardOmissionThreshold not set properly.\n");
00135     return 0;
00136   }
00137
00138   if (!getParameter("ModelsUnderOmissionThreshold", &_modelsUnderOmission)) {
00139     Log::instance()->error("Parameter ModelsUnderOmissionThreshold not set properly.\n");
00140     return 0;
00141   }
00142
00143   if (!getParameter("CommissionThreshold", &_commissionThreshold)) {
00144     Log::instance()->error("Parameter CommissionThreshold not set properly.\n");
00145     return 0;
00146   }
00147
00148   if (!getParameter("CommissionSampleSize", &_commissionSampleSize)) {
00149     Log::instance()->error("Parameter CommissionSampleSize not set properly.\n");
00150     return 0;
00151   }
00152
00153   if (!getParameter("MaxThreads", &_maxThreads)) {
00154     Log::instance()->error("Parameter MaxThreads not set properly.\n");
00155     return 0;
00156   }
00157
00158   if ( _maxThreads < 1 )
00159   {
00160     _maxThreads = 1;
00161   }
00162   else if ( _maxThreads > 1 )
00163   {
00164     // When maxThreads is greater than 1, if the machine has only one processor om
00165     // can crash. If the machine has more than one processor GDAL can output lots
00166     // of IO errors (current GDAL version does not seem to be thread safe).
00167     Log::instance()->warn("Multithreading is still experimental. When max threads is greater than 1, depending on software and hardware configuration this application may crash or you may see lots of raster IO warnings. In these cases, we recommend you to set this parameter to 1.\n");
00168   }
00169
00170   if (_trainProp <= 1.0)
00171   {
00172     Log::instance()->warn("The specified training proportion value is less than or equals 1. Please note that there was a change in the valid range for this parameter from 0-1 to 0-100. Small values may result in zero presence points being used to train the model.\n");
00173   }
00174
00175   // convert percentages (100%) to proportions (1.0) for external parameters
00176   _trainProp /= 100.0;
00177   _commissionThreshold /= 100.0;
00178   _omissionThreshold /= 100.0;
00179
00180   _softOmissionThreshold = (_omissionThreshold >= 1.0);
00181   if (_modelsUnderOmission > _totalRuns)
00182   {
00183     Log::instance()->warn("ModelsUnderOmission (%d) is greater than the number of runs (%d). ModelsUnderOmission will be reduced to (%d)\n", _modelsUnderOmission, _totalRuns, _totalRuns);
00184     _modelsUnderOmission = _totalRuns;
00185   }
00186
00187   _finishedRun = new AlgorithmRun*[_totalRuns];
00188   _activeRun = new AlgorithmRun*[_maxThreads];
00189
00190   transferParametersToAlgorithm();
00191
00192   return 1;
00193 }
00194
00195 /****************************************************************/
00196 /****************** iterate *************************************/
00197
00198 int BestSubsets::iterate()
00199 {
00200   static int iterations = 0;
00201   int active;
00202   AlgorithmRun * algRun;
00203
00204   ++iterations;
00205
00206   if (_done)
00207   { return 1; }
00208
00209   // check if it should start new runs
00210   if ((_numFinishedRuns + _numActiveRuns < _totalRuns) &&
00211       !earlyTerminationConditionMet())
00212   {
00213     // it needs to start more runs
00214     // wait for a slot for a new thread
00215     if ((active = numActiveThreads()) >= _maxThreads)
00216     {
00217       //Log::instance()->info("%5d] Waiting for a slot to run next thread (%d out of %d)\n", iterations, active, _maxThreads);
00218       SLEEP(2);
00219     }
00220
00221     else
00222     {
00223       int runId = _numFinishedRuns + _numActiveRuns;
00224
00225       //Log::instance()->info("%5d] There is an empty slot to run next thread (%d out of %d) - %d\n", iterations, active, _maxThreads, runId);
00226
00227       // start new Algorithm
00228       SamplerPtr train, test;
00229       splitSampler( _samp, &train, &test, _trainProp );
00230
00231       Log::instance()->debug( "Presences: orig=%d, train=%d, test=%d\n", _samp->numPresence(), train->numPresence(), test->numPresence() );
00232       Log::instance()->debug( "Absences:  orig=%d, train=%d, test=%d\n", _samp->numAbsence(), train->numAbsence(), test->numAbsence() );
00233
00234       algRun = new AlgorithmRun();
00235       algRun->initialize(runId,
00236           _commissionSampleSize, train, test,
00237           _nparam, _alg_params,
00238           (BSAlgorithmFactory *) this);
00239       _activeRun[_numActiveRuns++] = algRun;
00240       algRun->run();
00241     }
00242   }
00243
00244   else
00245   {
00246     // no more runs are needed
00247     // check if all active threads have finished
00248     active = numActiveThreads();
00249       if (active)
00250       {
00251         // there are still threads running
00252         /*
00253            Log::instance()->info("%5d] Waiting for %d active thread(s) to finish.\n",
00254            iterations, active);
00255            */
00256         SLEEP(2);
00257       }
00258
00259       else
00260       {
00261         // all running threads terminated
00262         // calculate best subset and exit
00263         //Log::instance()->info("%5d] Calculating best and terminating algorithm.\n", i);
00264         calculateBestSubset();
00265         _done = true;
00266       }
00267   }
00268
00269   return 1;
00270 }
00271
00272 /****************************************************************/
00273 int BestSubsets::numActiveThreads()
00274 {
00275   int i;
00276   AlgorithmRun * run;
00277
00278   for (i = 0; i < _numActiveRuns; i++)
00279   {
00280     run = _activeRun[i];
00281
00282     if (!run->running())
00283     {
00284       //Log::instance()->info("Thread %d has just finished.\n", run->getId());
00285
00286       // run finished its work
00287       // move it to finished runs
00288       // and remove it from list of active runs
00289       _finishedRun[_numFinishedRuns++] = run;
00290       _activeRun[i] = _activeRun[--_numActiveRuns];
00291       _activeRun[_numActiveRuns] = NULL;
00292
00293       // update count of models under omission threshold
00294       if (!_softOmissionThreshold)
00295       {
00296         if (run->getOmission() <= _omissionThreshold)
00297         { _currentModelsUnderOmissionThreshold++; }
00298       }
00299     }
00300   }
00301
00302   return _numActiveRuns;
00303 }
00304
00305 /****************************************************************/
00306 int BestSubsets::earlyTerminationConditionMet()
00307 {
00308   return (!_softOmissionThreshold) &&
00309     (_currentModelsUnderOmissionThreshold >= _modelsUnderOmission);
00310 }
00311
00312 /****************************************************************/
00313 int BestSubsets::calculateBestSubset()
00314 {
00315   int i;
00316
00317   Log::instance()->info("Calculating best subset of models.\n");
00318
00319   // make a copy of finished runs to play with
00320   AlgorithmRun ** runList = new AlgorithmRun*[_numFinishedRuns];
00321   for (i = 0; i < _numFinishedRuns; i++)
00322   { runList[i] = _finishedRun[i]; }
00323
00324   printListOfRuns("Finished Runs:", runList, _numFinishedRuns);
00325
00326   // get list of models that pass omission test
00327   // sort runs by omission
00328   // first <_modelsUnderOmission> runs are the selected ones
00329   sortRuns(runList, _numFinishedRuns, 0);
00330
00331   printListOfRuns("Finished Runs by Omission:", runList, _numFinishedRuns);
00332
00333   // get list of models that pass commission test
00334   sortRuns(runList, _modelsUnderOmission, 1);
00335
00336   printListOfRuns("Best Omission Runs by Commission:", runList, _numFinishedRuns);
00337
00338   _numBestRuns = (int)( _commissionThreshold * (double)_modelsUnderOmission + 0.5 );
00339   int medianRun = _modelsUnderOmission / 2;
00340   int firstRun = (int)ceil( (double)medianRun - (double)_numBestRuns / 2.0 );
00341
00342   _bestRun = new AlgorithmRun*[_numBestRuns];
00343
00344   for (i = 0; i < _numBestRuns; i++)
00345   { _bestRun[i] = runList[i + firstRun]; }
00346
00347   printListOfRuns("Best Runs:", _bestRun, _numBestRuns);
00348
00349   printf("Median: %d First: %d\n", medianRun, firstRun);
00350
00351
00352   delete[] runList;
00353
00354   Log::instance()->info("Selected best %d models out of %d.\n", _numBestRuns, _totalRuns);
00355
00356   return 1;
00357 }
00358
00359 /****************************************************************/
00360 void BestSubsets::sortRuns(AlgorithmRun ** runList,
00361     int nelements, int errorType)
00362 {
00363   int i, j;
00364   AlgorithmRun * runJ0, * runJ1;
00365
00366   Log::instance()->info("Sorting list %d of %d elements by index %d.\n", runList, nelements, errorType);
00367
00368   // bubble sort
00369   // TODO: change to quicksort if this becomes a bottleneck
00370   for (i = 0; i < nelements - 1; i++)
00371   {
00372     for (j = 0; j < nelements - i - 1; j++)
00373     {
00374       runJ0 = runList[j];
00375       runJ1 = runList[j + 1];
00376
00377       if (runJ0->getError(errorType) > runJ1->getError(errorType))
00378       {
00379         // exchange elements j and j + 1
00380         runList[j] = runJ1;
00381         runList[j + 1] = runJ0;
00382       }
00383     }
00384   }
00385 }
00386
00387 /****************************************************************/
00388 /****************** done ****************************************/
00389
00390 int BestSubsets::done() const
00391 {
00392   return _done;
00393 }
00394
00395 /****************************************************************/
00396 /****************** done ****************************************/
00397
00398 float BestSubsets::getProgress() const
00399 {
00400   if (done())
00401   { return 1.0; }
00402
00403   else
00404   {
00405     float progByTotalRuns = 0.0;
00406     float progByHardOmission = 0.0;
00407
00408     float avgProgressActiveRuns = 0.0;
00409     for (int i = 0; i < _numActiveRuns; i++)
00410     { avgProgressActiveRuns += _activeRun[i]->getProgress(); }
00411     avgProgressActiveRuns /= _numActiveRuns;
00412
00413     progByTotalRuns = (_numFinishedRuns + avgProgressActiveRuns) / (float) _totalRuns;
00414
00415     if (!_softOmissionThreshold)
00416     {
00417       progByHardOmission = (_currentModelsUnderOmissionThreshold /
00418           (float) _modelsUnderOmission);
00419     }
00420
00421     float progress = (progByTotalRuns > progByHardOmission)? progByTotalRuns : progByHardOmission;
00422
00423     if (progress > _maxProgress)
00424     { _maxProgress = progress; }
00425
00426     return _maxProgress;
00427   }
00428 }
00429
00430 /****************************************************************/
00431 /****************** getValue ************************************/
00432
00433 Scalar BestSubsets::getValue( const Sample& x ) const
00434 {
00435   int i;
00436   double sum = 0.0;
00437
00438   if (_done)
00439   {
00440     for (i = 0; i < _numBestRuns; i++)
00441     { sum += _bestRun[i]->getValue(x); }
00442   }
00443
00444   return sum / (double) _numBestRuns;
00445 }
00446
00447 /****************************************************************/
00448 /****************** getConvergence ******************************/
00449
00450 int BestSubsets::getConvergence( Scalar * const val ) const
00451 {
00452   *val = 0;
00453   return 0;
00454 }
00455
00456 /****************************************************************/
00457 /****************** getConfiguration ****************************/
00458 void BestSubsets::_getConfiguration( ConfigurationPtr& config ) const
00459 {
00460   if ( !_done ) {
00461
00462     return;
00463   }
00464
00465   ConfigurationPtr model_config( new ConfigurationImpl("BestSubsets") );
00466   config->addSubsection( model_config );
00467
00468   model_config->addNameValue("Count", _numBestRuns);
00469
00470   for ( int i=0; i < _numBestRuns; i++ ) {
00471
00472     ConfigurationPtr child_config( new ConfigurationImpl("Run") );
00473     child_config->addNameValue( "Id", i );
00474     child_config->addNameValue( "OmissionError", _bestRun[i]->getOmission() * 100 );
00475     child_config->addNameValue( "CommissionError", _bestRun[i]->getCommission() * 100 );
00476
00477     ConfigurationPtr alg_config = _bestRun[i]->getAlgorithm()->getConfiguration();
00478     child_config->addSubsection( alg_config );
00479
00480     model_config->addSubsection( child_config );
00481   }
00482 }
00483
00484 /***************************************************************/
00485 /****************** setConfiguration ***************************/
00486 void BestSubsets::_setConfiguration( const ConstConfigurationPtr& config )
00487 {
00488   ConstConfigurationPtr model_config = config->getSubsection( "BestSubsets", false );
00489
00490   if ( ! model_config ) {
00491
00492     return;
00493   }
00494
00495   _done = true;
00496
00497   _numBestRuns = model_config->getAttributeAsInt( "Count", 0 );
00498
00499   _bestRun = new AlgorithmRun*[_numBestRuns];
00500
00501   Configuration::subsection_list runs = model_config->getAllSubsections();
00502
00503   Configuration::subsection_list::const_iterator fin = runs.end();
00504   Configuration::subsection_list::const_iterator it = runs.begin();
00505   // The index i is used to populate the _bestRuns array it is incremented after each
00506   // new algorithm is found.
00507   int i;
00508   for ( i = 0; it != fin; ++it ) {
00509
00510     // Test this here rather than at the bottom of loop.
00511     // This needs to be done after checking for loop terminal condition.
00512     if ( i == _numBestRuns ) {
00513
00514       throw ConfigurationException( "Number of deserialized algorithms exceeds Count" );
00515     }
00516
00517     ConstConfigurationPtr run_config = *it;
00518
00519     if ( run_config->getName() != "Run" ) {
00520
00521       continue;
00522     }
00523
00524     AlgorithmPtr alg = AlgorithmFactory::newAlgorithm( run_config->getSubsection("Algorithm") );
00525
00526     _bestRun[i] = new AlgorithmRun( alg );
00527
00528     // increment i after adding algorithmRun to _bestRun
00529     ++i;
00530   }
00531
00532   if ( i < _numBestRuns ) {
00533
00534     throw ConfigurationException( "Number of deserialized algorithms is smaller than Count" );
00535   }
00536 }