openModeller  Version 1.4.0
AbstractBestSubsets.cpp
Go to the documentation of this file.
00001 
00029 #ifdef WIN32
00030 // avoid warnings caused by problems in VC headers
00031 #define _SCL_SECURE_NO_DEPRECATE
00032 #endif
00033 
#include <string>
using std::string;

#include "AbstractBestSubsets.hh"
#include <openmodeller/Exceptions.hh>

#include <algorithm> // for std::stable_sort
#include <math.h> // for function ceil()
00041 
00042 #ifdef WIN32
00043 #include <windows.h>
00044 #define SLEEP(secs) Sleep(secs * 1000)
00045 #else
00046 #include <unistd.h>
00047 #define SLEEP(secs) sleep(secs);
00048 #endif
00049 
00050 /****************************************************************/
00051 static void printListOfRuns(string msg, AlgorithmRun ** runs, int numOfRuns)
00052 {
00053   printf("%s\n", msg.c_str());
00054   for (int i = 0; i < numOfRuns; i++)
00055     printf("%4d] om=%5.3f comm=%5.3f (id=%d)\n", i, 
00056         runs[i]->getOmission(), 
00057         runs[i]->getCommission(),
00058         runs[i]->getId());
00059 }
00060 
00061 /****************************************************************/
00062 /****************** Garp class **********************************/
00063 
00064 AbstractBestSubsets::AbstractBestSubsets( AlgMetadata const * metadata_bs) :
00065   AlgorithmImpl( metadata_bs )
00066 {
00067   _trainProp = 0.0;
00068   _totalRuns = 0;
00069   _omissionThreshold = 0.0;
00070   _modelsUnderOmission = 0;
00071   _commissionThreshold = 0.0;
00072   _commissionSampleSize = 0;
00073   _maxThreads = 0;
00074 
00075   _softOmissionThreshold = false;
00076   _currentModelsUnderOmissionThreshold = 0;
00077 
00078   _finishedRun = NULL;
00079   _activeRun = NULL;
00080   _bestRun = NULL;
00081 
00082   _numFinishedRuns = 0;
00083   _numActiveRuns = 0;
00084   _done = false;
00085 
00086   _maxProgress = 0.0;
00087 
00088 }
00089 
00090 // ****************************************************************
00091 AbstractBestSubsets::~AbstractBestSubsets()
00092 {
00093   int i;
00094 
00095   if (_finishedRun)
00096   {
00097     for (i = 0; i < _numFinishedRuns; i++)
00098     { delete _finishedRun[i]; }
00099     delete[] _finishedRun;
00100   }
00101 
00102   if (_activeRun)
00103   { delete[] _activeRun; }
00104 
00105   // bestRun just point to objects referenced by _finishedRun object
00106   if (_bestRun)
00107   { delete[] _bestRun; }
00108 
00109 }
00110 
00111 /****************************************************************/
00112 /****************** configuration *******************************/
00113 void
00114 AbstractBestSubsets::_getConfiguration( ConfigurationPtr& config ) const
00115 {
00116 
00117   if ( !_done )
00118     return;
00119 
00120   ConfigurationPtr model_config( new ConfigurationImpl("BestSubsets") );
00121   config->addSubsection( model_config );
00122 
00123   model_config->addNameValue("Count", _numBestRuns);
00124 
00125   for( int i=0; i<_numBestRuns; i++ ) {
00126     ConfigurationPtr child_config( new ConfigurationImpl("Run" ) );
00127     child_config->addNameValue( "Id", i );
00128     child_config->addNameValue( "OmissionError", _bestRun[i]->getOmission() * 100 );
00129     child_config->addNameValue( "CommissionError", _bestRun[i]->getCommission() * 100 );
00130 
00131     ConfigurationPtr alg_config = _bestRun[i]->getAlgorithm()->getConfiguration();
00132     child_config->addSubsection( alg_config );
00133 
00134     model_config->addSubsection( child_config );
00135 
00136   }
00137 
00138 }
00139 
00140   void
00141 AbstractBestSubsets::_setConfiguration( const ConstConfigurationPtr& config )
00142 {
00143 
00144   ConstConfigurationPtr model_config = config->getSubsection("BestSubsets",false);
00145   if (!model_config)
00146     return;
00147 
00148   _done = true;
00149 
00150   _numBestRuns = model_config->getAttributeAsInt( "Count", 0 );
00151 
00152   _bestRun = new AlgorithmRun*[_numBestRuns];
00153 
00154   Configuration::subsection_list runs = model_config->getAllSubsections();
00155 
00156   Configuration::subsection_list::const_iterator fin = runs.end();
00157   Configuration::subsection_list::const_iterator it = runs.begin();
00158   // The index i is used to populate the _bestRuns array it is incremented after each
00159   // new algorithm is found.
00160   int i;
00161   for( i = 0; it != fin; ++it ) {
00162 
00163     // Test this here rather than at the bottom of loop.
00164     // This needs to be done after checking for loop terminal condition.
00165     if ( i == _numBestRuns ) {
00166       throw ConfigurationException( "Number of deserialized algorithms exceeds Count" );
00167     }
00168 
00169     ConstConfigurationPtr run_config = *it;
00170 
00171     if ( run_config->getName() != "Run" ) {
00172       continue;
00173     }
00174 
00175     AlgorithmPtr alg = AlgorithmFactory::newAlgorithm( run_config->getSubsection("Algorithm") );
00176 
00177     _bestRun[i] = new AlgorithmRun( alg );
00178 
00179     // increment i after adding algorithmRun to _bestRun
00180     ++i;
00181   }
00182 
00183   if ( i < _numBestRuns ) {
00184     throw ConfigurationException( "Number of deserialized algorithms is smaller than Count" );
00185   }
00186 }
00187 
00188 // ****************************************************************
00189 // ************* needNormalization ********************************
00190 
00191 int AbstractBestSubsets::needNormalization()
00192 {
00193   // This is a hack.  needNormalization is called before initialize.
00194   AbstractBestSubsets *non_const = const_cast<AbstractBestSubsets*>(this);
00195   if ( _subAlgorithm.empty() ) {
00196     if ( !non_const->getParameter("SubAlgorithm", & _subAlgorithm ) ) {
00197       std::string error = "Parameter SubAlgorithm not set properly.";
00198       Log::instance()->error( error.c_str() );
00199       throw AlgorithmException( error );
00200     }
00201   }
00202 
00203   AlgorithmPtr alg = AlgorithmFactory::newAlgorithm( _subAlgorithm );
00204 
00205   if ( alg->needNormalization() ) {
00206 
00207     Log::instance()->info( "Computing normalization in best subsets\n");
00208 
00209     // Compute normalization here to avoid computing it again in each GARP run
00210     // note: getNormalizer will return a copy
00211     _normalizerPtr = alg->getNormalizer();
00212 
00213     if ( _normalizerPtr ) {
00214 
00215       _normalizerPtr->computeNormalization( _samp );
00216 
00217       setNormalization( _samp );
00218     }
00219   }
00220  
00221   // No need to normalize again 
00222   return 0;
00223 }
00224 
00225 // ****************************************************************
00226 // ************* initialize ***************************************
00227 
/// Reads and validates all best-subsets parameters, converts the
/// percentage-valued parameters to proportions, and allocates the run
/// bookkeeping arrays (_finishedRun, _activeRun).
/// @return 1 on success, 0 when a required parameter is missing.
int AbstractBestSubsets::initialize()
{
  // BS parameters
  // SubAlgorithm is intentionally not read here — it is fetched lazily
  // in needNormalization(), which runs before initialize().
#if 0
  if (!getParameter("SubAlgorithm", &_subAlgorithm)) {
    Log::instance()->error("Parameter SubAlgorithm not set properly.\n");
    return 0;
  }
#endif

  // Percentage (0-100) of the sampler's points used to train each run.
  if (!getParameter("TrainingProportion", &_trainProp)) {
    Log::instance()->error("Parameter TrainingProportion not set properly.\n");
    return 0;
  }

  // Total number of sub-algorithm runs to execute.
  if (!getParameter("TotalRuns", &_totalRuns)) {
    Log::instance()->error("Parameter TotalRuns not set properly.\n");
    return 0;
  }

  // Omission error limit as a percentage; >= 100 disables the hard
  // threshold (see _softOmissionThreshold below).
  if (!getParameter("HardOmissionThreshold", &_omissionThreshold)) {
    Log::instance()->error("Parameter HardOmissionThreshold not set properly.\n");
    return 0;
  }

  // Number of models under the omission threshold needed for early stop
  // (see earlyTerminationConditionMet()).
  if (!getParameter("ModelsUnderOmissionThreshold", &_modelsUnderOmission)) {
    Log::instance()->error("Parameter ModelsUnderOmissionThreshold not set properly.\n");
    return 0;
  }

  // Percentage of the omission-selected models kept as the best subset
  // (used in calculateBestSubset()).
  if (!getParameter("CommissionThreshold", &_commissionThreshold)) {
    Log::instance()->error("Parameter CommissionThreshold not set properly.\n");
    return 0;
  }

  // Sample size forwarded to each AlgorithmRun::initialize() for
  // commission-error estimation (see iterate()).
  if (!getParameter("CommissionSampleSize", &_commissionSampleSize)) {
    Log::instance()->error("Parameter CommissionSampleSize not set properly.\n");
    return 0;
  }

  // Maximum number of runs executed concurrently.
  if (!getParameter("MaxThreads", &_maxThreads)) {
    Log::instance()->error("Parameter MaxThreads not set properly.\n");
    return 0;
  }

  // Clamp to at least one thread; warn about experimental multithreading.
  if ( _maxThreads < 1 )
  {
    _maxThreads = 1;
  }
  else if ( _maxThreads > 1 )
  {
    // When maxThreads is greater than 1, if the machine has only one processor om
    // can crash. If the machine has more than one processor GDAL can output lots
    // of IO errors (current GDAL version does not seem to be thread safe).
    Log::instance()->warn("Multithreading is still experimental. When max threads is greater than 1, depending on software and hardware configuration this application may crash or you may see lots of raster IO warnings. In these cases, we recommend you to set this parameter to 1.\n");
  }

  // Values <= 1 are probably old-style proportions (valid range changed
  // from 0-1 to 0-100); warn but keep the value.
  if (_trainProp <= 1.0)
  {
    Log::instance()->warn("The specified training proportion value is less than or equals 1. Please note that there was a change in the valid range for this parameter from 0-1 to 0-100. Small values may result in zero presence points being used to train the model.\n");
  }

  // convert percentages (100%) to proportions (1.0) for external parameters 
  _trainProp /= 100.0;
  _commissionThreshold /= 100.0;
  _omissionThreshold /= 100.0;

  // A (converted) omission threshold of 1.0 or more can never reject a
  // model, so treat it as a "soft" (disabled) threshold.
  _softOmissionThreshold = (_omissionThreshold >= 1.0);
  if (_modelsUnderOmission > _totalRuns)
  {
    Log::instance()->warn("ModelsUnderOmission (%d) is greater than the number of runs (%d). ModelsUnderOmission will be reduced to (%d)\n", _modelsUnderOmission, _totalRuns, _totalRuns);
    _modelsUnderOmission = _totalRuns;
  }

  // Worst case every run finishes, and at most _maxThreads run at once.
  _finishedRun = new AlgorithmRun*[_totalRuns];
  _activeRun = new AlgorithmRun*[_maxThreads];

  return 1;
}
00307 
00308 /****************************************************************/
00309 /****************** iterate *************************************/
00310 
/// Performs one scheduling step: starts a new sub-algorithm run when a
/// thread slot is free, waits while slots are busy, and — once all runs
/// are finished or early termination is met — computes the best subset
/// and marks the model done.
/// @return always 1.
int AbstractBestSubsets::iterate()
{
  // NOTE(review): these function-level statics are shared by ALL
  // instances of this class in the process and are never reset, so a
  // second model computed in the same process continues the counters
  // (run ids keep growing). They look like they should be member
  // variables — confirm before changing.
  static int iterations = 0;
  static int runId = 0;
  int active;
  AlgorithmRun * algRun;

  ++iterations;

  // Nothing left to do once the best subset has been computed.
  if (_done)
  { return 1; }

  // check if it should start new runs
  if ((_numFinishedRuns + _numActiveRuns < _totalRuns) && 
      !earlyTerminationConditionMet())
  {
    // it needs to start more runs
    // wait for a slot for a new thread
    if ((active = numActiveThreads()) >= _maxThreads)
    {
      //Log::instance()->info("%5d] Waiting for a slot to run next thread (%d out of %d)\n", iterations, active, _maxThreads);
      SLEEP(2); 
    }

    else
    {
      //Log::instance()->debug("%5d] There is an empty slot to run next thread (%d out of %d) - %d\n", iterations, active, _maxThreads, runId);

      // start new Algorithm on a fresh train/test split of the sampler
      SamplerPtr train, test;
      splitSampler(_samp, &train, &test, _trainProp);

      //printf("Presences: Orig=%d, train=%d, test=%d\n", _samp->numPresence(), train->numPresence(), test->numPresence());
      //printf("Absences:  Orig=%d, train=%d, test=%d\n", _samp->numAbsence(), train->numAbsence(), test->numAbsence());

      // The sub-algorithm trains on `train`; AlgorithmRun evaluates
      // omission on `test` and commission on _commissionSampleSize points.
      AlgorithmPtr algo = AlgorithmFactory::newAlgorithm( _subAlgorithm );
      algo->setParameters( _param );
      algo->setSampler(train);
      algo->initialize();
      algRun = new AlgorithmRun(algo);
      algRun->initialize(runId++,
          _commissionSampleSize, train, test ); 
      _activeRun[_numActiveRuns++] = algRun;
      algRun->run();
    }
  }

  else
  {
    // no more runs are needed
    // check if all active threads have finished
    active = numActiveThreads();
    if (active)
    {
      // there are still threads running
      //  Log::instance()->info("%5d] Waiting for %d active thread(s) to finish.\n", iterations, active);
      SLEEP(2); 
    }
    else
    {
      // all running threads terminated
      // calculate best subset and exit
      //Log::instance()->info("%5d] Calculating best and terminating algorithm.\n", iterations);
      calculateBestSubset();
      _done = true;
    }
  }

  return 1;
}
00381 
00382 /****************************************************************/
00383 int AbstractBestSubsets::numActiveThreads()
00384 {
00385   int i;
00386   AlgorithmRun * run;
00387 
00388   for (i = 0; i < _numActiveRuns; i++)
00389   {
00390     run = _activeRun[i];
00391 
00392     if (!run->running())
00393     {
00394       //Log::instance()->info("Thread %d has just finished.\n", run->getId());
00395 
00396       // run finished its work
00397       // move it to finished runs
00398       // and remove it from list of active runs
00399       _finishedRun[_numFinishedRuns++] = run;
00400       _activeRun[i] = _activeRun[--_numActiveRuns];
00401       _activeRun[_numActiveRuns] = NULL;
00402 
00403       // update count of models under omission threshold
00404       if (!_softOmissionThreshold)
00405       {
00406         if (run->getOmission() <= _omissionThreshold)
00407         { _currentModelsUnderOmissionThreshold++; }
00408       }
00409     }
00410   }
00411 
00412   return _numActiveRuns;
00413 }
00414 
00415 /****************************************************************/
00416 int AbstractBestSubsets::earlyTerminationConditionMet()
00417 {
00418   return (!_softOmissionThreshold) &&
00419     (_currentModelsUnderOmissionThreshold >= _modelsUnderOmission);
00420 }
00421 
00422 /****************************************************************/
00423 int AbstractBestSubsets::calculateBestSubset()
00424 {
00425   int i;
00426 
00427   Log::instance()->info("Calculating best subset of models.\n");
00428 
00429   // make a copy of finished runs to play with
00430   AlgorithmRun ** runList = new AlgorithmRun*[_numFinishedRuns];
00431   for (i = 0; i < _numFinishedRuns; i++)
00432   { runList[i] = _finishedRun[i]; }
00433 
00434   printListOfRuns("Finished Runs:", runList, _numFinishedRuns);
00435 
00436   // get list of models that pass omission test
00437   // sort runs by omission
00438   // first <_modelsUnderOmission> runs are the selected ones
00439   sortRuns(runList, _numFinishedRuns, 0);
00440 
00441   printListOfRuns("Finished Runs by Omission:", runList, _numFinishedRuns);
00442 
00443   // get list of models that pass commission test
00444   sortRuns(runList, _modelsUnderOmission, 1);
00445 
00446   printListOfRuns("Best Omission Runs by Commission:", runList, _numFinishedRuns);
00447 
00448   _numBestRuns = (int)( _commissionThreshold * (double)_modelsUnderOmission + 0.5 );
00449   int medianRun = _modelsUnderOmission / 2;
00450   int firstRun = (int)ceil( (double) medianRun - (double)_numBestRuns / 2.0 );
00451 
00452   _bestRun = new AlgorithmRun*[_numBestRuns];
00453 
00454   for (i = 0; i < _numBestRuns; i++)
00455   { _bestRun[i] = runList[i + firstRun]; }
00456 
00457   printListOfRuns("Best Runs:", _bestRun, _numBestRuns);
00458 
00459   printf("Median: %d First: %d\n", medianRun, firstRun);
00460 
00461 
00462   delete[] runList;
00463 
00464   Log::instance()->info("Selected best %d models out of %d.\n", _numBestRuns, _totalRuns);
00465 
00466   return 1;
00467 }
00468 
00469 /****************************************************************/
00470 void AbstractBestSubsets::sortRuns(AlgorithmRun ** runList, 
00471     int nelements, int errorType)
00472 {
00473   int i, j;
00474   AlgorithmRun * runJ0, * runJ1;
00475 
00476   //Log::instance()->info("Sorting list %x of %d elements by index %d.\n", runList, nelements, errorType);
00477 
00478   // bubble sort
00479   // TODO: change to quicksort if this becomes a bottleneck
00480   for (i = 0; i < nelements - 1; i++)
00481   { 
00482     for (j = 0; j < nelements - i - 1; j++)
00483     {
00484       runJ0 = runList[j];
00485       runJ1 = runList[j + 1];
00486 
00487       if (runJ0->getError(errorType) > runJ1->getError(errorType))
00488       {
00489         // exchange elements j and j + 1
00490         runList[j] = runJ1;
00491         runList[j + 1] = runJ0;
00492       }
00493     }
00494   }
00495 }
00496 
00497 /****************************************************************/
00498 /****************** done ****************************************/
00499 
00500 int AbstractBestSubsets::done() const
00501 {
00502   return _done;
00503 }
00504 
00505 /****************************************************************/
00506 /****************** done ****************************************/
00507 
00508 float AbstractBestSubsets::getProgress() const
00509 {
00510   if (done())
00511   { return 1.0; } 
00512 
00513   else
00514   {
00515     float progByTotalRuns = 0.0;
00516     float progByHardOmission = 0.0;
00517 
00518     float avgProgressActiveRuns = 0.0;
00519     for (int i = 0; i < _numActiveRuns; i++)
00520     { avgProgressActiveRuns += _activeRun[i]->getProgress(); }
00521     avgProgressActiveRuns /= _numActiveRuns;
00522 
00523     progByTotalRuns = (_numFinishedRuns + avgProgressActiveRuns) / (float) _totalRuns;
00524 
00525     if (!_softOmissionThreshold)
00526     {
00527       progByHardOmission = (_currentModelsUnderOmissionThreshold / 
00528           (float) _modelsUnderOmission); 
00529     }
00530 
00531     float progress = (progByTotalRuns > progByHardOmission)? progByTotalRuns : progByHardOmission;
00532 
00533     if (progress > _maxProgress)
00534     { _maxProgress = progress; }
00535 
00536     return _maxProgress;
00537   }
00538 }
00539 
00540 /****************************************************************/
00541 /****************** getValue ************************************/
00542 
00543 Scalar AbstractBestSubsets::getValue( const Sample& x ) const
00544 {
00545   int i;
00546   double sum = 0.0;
00547 
00548   if (_done)
00549   {
00550     for (i = 0; i < _numBestRuns; i++)
00551     { sum += _bestRun[i]->getValue(x); }
00552   }
00553 
00554   return sum / (double) _numBestRuns;
00555 }
00556 
00557 /****************************************************************/
00558 /****************** getConvergence ******************************/
00559 
/// Convergence tracking is not supported by this meta-algorithm.
/// @param val output parameter; always set to 0.
/// @return 0, meaning no convergence information is available.
int AbstractBestSubsets::getConvergence( Scalar * const val ) const
{
  *val = 0;
  return 0;
}
00565