openModeller
Version 1.4.0
|
00001 00030 #include "best_subsets.hh" 00031 00032 #include <openmodeller/om.hh> 00033 #include <openmodeller/Random.hh> 00034 #include <openmodeller/Sampler.hh> 00035 #include <openmodeller/Exceptions.hh> 00036 00037 #include <stdlib.h> 00038 #include <stdio.h> 00039 #include <math.h> 00040 #include <string> 00041 using std::string; 00042 00043 #ifdef WIN32 00044 #include <windows.h> 00045 #define SLEEP(secs) Sleep(secs * 1000) 00046 #else 00047 #include <unistd.h> 00048 #define SLEEP(secs) sleep(secs); 00049 #endif 00050 00051 00052 /****************************************************************/ 00053 void BestSubsets::printListOfRuns(string msg, AlgorithmRun ** runs, int numOfRuns) 00054 { 00055 printf("%s\n", msg.c_str()); 00056 for (int i = 0; i < numOfRuns; i++) 00057 printf("%4d] om=%5.3f comm=%5.3f (id=%d)\n", i, 00058 runs[i]->getOmission(), 00059 runs[i]->getCommission(), 00060 runs[i]->getId()); 00061 } 00062 00063 /****************************************************************/ 00064 /****************** Garp class **********************************/ 00065 00066 BestSubsets::BestSubsets(AlgMetadata * metadata) 00067 : AlgorithmImpl(metadata) 00068 { 00069 _trainProp = 0.0; 00070 _totalRuns = 0; 00071 _omissionThreshold = 0.0; 00072 _modelsUnderOmission = 0; 00073 _commissionThreshold = 0.0; 00074 _commissionSampleSize = 0; 00075 00076 _softOmissionThreshold = false; 00077 _currentModelsUnderOmissionThreshold = 0; 00078 00079 _finishedRun = NULL; 00080 _activeRun = NULL; 00081 _bestRun = NULL; 00082 00083 _nparam = 0; 00084 _alg_params = NULL; 00085 00086 _numFinishedRuns = 0; 00087 _numActiveRuns = 0; 00088 _done = false; 00089 00090 _maxProgress = 0.0; 00091 00092 } 00093 00094 // **************************************************************** 00095 BestSubsets::~BestSubsets() 00096 { 00097 int i; 00098 00099 if (_finishedRun) 00100 { 00101 for (i = 0; i < _numFinishedRuns; i++) 00102 { delete _finishedRun[i]; } 00103 delete[] _finishedRun; 00104 } 00105 00106 if (_activeRun) 00107 { delete[] _activeRun; } 00108 00109 // bestRun just point to objects referenced by _finishedRun object 00110 if (_bestRun) 00111 { delete[] _bestRun; } 00112 00113 if (_alg_params) 00114 delete[] _alg_params; 00115 } 00116 00117 // **************************************************************** 00118 // ************* initialize *************************************** 00119 00120 int BestSubsets::initialize() 00121 { 00122 // BS parameters 00123 if (!getParameter("TrainingProportion", &_trainProp)) { 00124 Log::instance()->error("Parameter TrainingProportion not set properly.\n"); 00125 return 0; 00126 } 00127 00128 if (!getParameter("TotalRuns", &_totalRuns)) { 00129 Log::instance()->error("Parameter TotalRuns not set properly.\n"); 00130 return 0; 00131 } 00132 00133 if (!getParameter("HardOmissionThreshold", &_omissionThreshold)) { 00134 Log::instance()->error("Parameter HardOmissionThreshold not set properly.\n"); 00135 return 0; 00136 } 00137 00138 if (!getParameter("ModelsUnderOmissionThreshold", &_modelsUnderOmission)) { 00139 Log::instance()->error("Parameter ModelsUnderOmissionThreshold not set properly.\n"); 00140 return 0; 00141 } 00142 00143 if (!getParameter("CommissionThreshold", &_commissionThreshold)) { 00144 Log::instance()->error("Parameter CommissionThreshold not set properly.\n"); 00145 return 0; 00146 } 00147 00148 if (!getParameter("CommissionSampleSize", &_commissionSampleSize)) { 00149 Log::instance()->error("Parameter CommissionSampleSize not set properly.\n"); 00150 return 0; 00151 } 00152 00153 if (!getParameter("MaxThreads", &_maxThreads)) { 00154 Log::instance()->error("Parameter MaxThreads not set properly.\n"); 00155 return 0; 00156 } 00157 00158 if ( _maxThreads < 1 ) 00159 { 00160 _maxThreads = 1; 00161 } 00162 else if ( _maxThreads > 1 ) 00163 { 00164 // When maxThreads is greater than 1, if the machine has only one processor om 00165 // can crash. If the machine has more than one processor GDAL can output lots 00166 // of IO errors (current GDAL version does not seem to be thread safe). 00167 Log::instance()->warn("Multithreading is still experimental. When max threads is greater than 1, depending on software and hardware configuration this application may crash or you may see lots of raster IO warnings. In these cases, we recommend you to set this parameter to 1.\n"); 00168 } 00169 00170 if (_trainProp <= 1.0) 00171 { 00172 Log::instance()->warn("The specified training proportion value is less than or equals 1. Please note that there was a change in the valid range for this parameter from 0-1 to 0-100. Small values may result in zero presence points being used to train the model.\n"); 00173 } 00174 00175 // convert percentages (100%) to proportions (1.0) for external parameters 00176 _trainProp /= 100.0; 00177 _commissionThreshold /= 100.0; 00178 _omissionThreshold /= 100.0; 00179 00180 _softOmissionThreshold = (_omissionThreshold >= 1.0); 00181 if (_modelsUnderOmission > _totalRuns) 00182 { 00183 Log::instance()->warn("ModelsUnderOmission (%d) is greater than the number of runs (%d). ModelsUnderOmission will be reduced to (%d)\n", _modelsUnderOmission, _totalRuns, _totalRuns); 00184 _modelsUnderOmission = _totalRuns; 00185 } 00186 00187 _finishedRun = new AlgorithmRun*[_totalRuns]; 00188 _activeRun = new AlgorithmRun*[_maxThreads]; 00189 00190 transferParametersToAlgorithm(); 00191 00192 return 1; 00193 } 00194 00195 /****************************************************************/ 00196 /****************** iterate *************************************/ 00197 00198 int BestSubsets::iterate() 00199 { 00200 static int iterations = 0; 00201 int active; 00202 AlgorithmRun * algRun; 00203 00204 ++iterations; 00205 00206 if (_done) 00207 { return 1; } 00208 00209 // check if it should start new runs 00210 if ((_numFinishedRuns + _numActiveRuns < _totalRuns) && 00211 !earlyTerminationConditionMet()) 00212 { 00213 // it needs to start more runs 00214 // wait for a slot for a new thread 00215 if ((active = numActiveThreads()) >= _maxThreads) 00216 { 00217 //Log::instance()->info("%5d] Waiting for a slot to run next thread (%d out of %d)\n", iterations, active, _maxThreads); 00218 SLEEP(2); 00219 } 00220 00221 else 00222 { 00223 int runId = _numFinishedRuns + _numActiveRuns; 00224 00225 //Log::instance()->info("%5d] There is an empty slot to run next thread (%d out of %d) - %d\n", iterations, active, _maxThreads, runId); 00226 00227 // start new Algorithm 00228 SamplerPtr train, test; 00229 splitSampler( _samp, &train, &test, _trainProp ); 00230 00231 Log::instance()->debug( "Presences: orig=%d, train=%d, test=%d\n", _samp->numPresence(), train->numPresence(), test->numPresence() ); 00232 Log::instance()->debug( "Absences: orig=%d, train=%d, test=%d\n", _samp->numAbsence(), train->numAbsence(), test->numAbsence() ); 00233 00234 algRun = new AlgorithmRun(); 00235 algRun->initialize(runId, 00236 _commissionSampleSize, train, test, 00237 _nparam, _alg_params, 00238 (BSAlgorithmFactory *) this); 00239 _activeRun[_numActiveRuns++] = algRun; 00240 algRun->run(); 00241 } 00242 } 00243 00244 else 00245 { 00246 // no more runs are needed 00247 // check if all active threads have finished 00248 active = numActiveThreads(); 00249 if (active) 00250 { 00251 // there are still threads running 00252 /* 00253 Log::instance()->info("%5d] Waiting for %d active thread(s) to finish.\n", 00254 iterations, active); 00255 */ 00256 SLEEP(2); 00257 } 00258 00259 else 00260 { 00261 // all running threads terminated 00262 // calculate best subset and exit 00263 //Log::instance()->info("%5d] Calculating best and terminating algorithm.\n", i); 00264 calculateBestSubset(); 00265 _done = true; 00266 } 00267 } 00268 00269 return 1; 00270 } 00271 00272 /****************************************************************/ 00273 int BestSubsets::numActiveThreads() 00274 { 00275 int i; 00276 AlgorithmRun * run; 00277 00278 for (i = 0; i < _numActiveRuns; i++) 00279 { 00280 run = _activeRun[i]; 00281 00282 if (!run->running()) 00283 { 00284 //Log::instance()->info("Thread %d has just finished.\n", run->getId()); 00285 00286 // run finished its work 00287 // move it to finished runs 00288 // and remove it from list of active runs 00289 _finishedRun[_numFinishedRuns++] = run; 00290 _activeRun[i] = _activeRun[--_numActiveRuns]; 00291 _activeRun[_numActiveRuns] = NULL; 00292 00293 // update count of models under omission threshold 00294 if (!_softOmissionThreshold) 00295 { 00296 if (run->getOmission() <= _omissionThreshold) 00297 { _currentModelsUnderOmissionThreshold++; } 00298 } 00299 } 00300 } 00301 00302 return _numActiveRuns; 00303 } 00304 00305 /****************************************************************/ 00306 int BestSubsets::earlyTerminationConditionMet() 00307 { 00308 return (!_softOmissionThreshold) && 00309 (_currentModelsUnderOmissionThreshold >= _modelsUnderOmission); 00310 } 00311 00312 /****************************************************************/ 00313 int BestSubsets::calculateBestSubset() 00314 { 00315 int i; 00316 00317 Log::instance()->info("Calculating best subset of models.\n"); 00318 00319 // make a copy of finished runs to play with 00320 AlgorithmRun ** runList = new AlgorithmRun*[_numFinishedRuns]; 00321 for (i = 0; i < _numFinishedRuns; i++) 00322 { runList[i] = _finishedRun[i]; } 00323 00324 printListOfRuns("Finished Runs:", runList, _numFinishedRuns); 00325 00326 // get list of models that pass omission test 00327 // sort runs by omission 00328 // first <_modelsUnderOmission> runs are the selected ones 00329 sortRuns(runList, _numFinishedRuns, 0); 00330 00331 printListOfRuns("Finished Runs by Omission:", runList, _numFinishedRuns); 00332 00333 // get list of models that pass commission test 00334 sortRuns(runList, _modelsUnderOmission, 1); 00335 00336 printListOfRuns("Best Omission Runs by Commission:", runList, _numFinishedRuns); 00337 00338 _numBestRuns = (int)( _commissionThreshold * (double)_modelsUnderOmission + 0.5 ); 00339 int medianRun = _modelsUnderOmission / 2; 00340 int firstRun = (int)ceil( (double)medianRun - (double)_numBestRuns / 2.0 ); 00341 00342 _bestRun = new AlgorithmRun*[_numBestRuns]; 00343 00344 for (i = 0; i < _numBestRuns; i++) 00345 { _bestRun[i] = runList[i + firstRun]; } 00346 00347 printListOfRuns("Best Runs:", _bestRun, _numBestRuns); 00348 00349 printf("Median: %d First: %d\n", medianRun, firstRun); 00350 00351 00352 delete[] runList; 00353 00354 Log::instance()->info("Selected best %d models out of %d.\n", _numBestRuns, _totalRuns); 00355 00356 return 1; 00357 } 00358 00359 /****************************************************************/ 00360 void BestSubsets::sortRuns(AlgorithmRun ** runList, 00361 int nelements, int errorType) 00362 { 00363 int i, j; 00364 AlgorithmRun * runJ0, * runJ1; 00365 00366 Log::instance()->info("Sorting list %d of %d elements by index %d.\n", runList, nelements, errorType); 00367 00368 // bubble sort 00369 // TODO: change to quicksort if this becomes a bottleneck 00370 for (i = 0; i < nelements - 1; i++) 00371 { 00372 for (j = 0; j < nelements - i - 1; j++) 00373 { 00374 runJ0 = runList[j]; 00375 runJ1 = runList[j + 1]; 00376 00377 if (runJ0->getError(errorType) > runJ1->getError(errorType)) 00378 { 00379 // exchange elements j and j + 1 00380 runList[j] = runJ1; 00381 runList[j + 1] = runJ0; 00382 } 00383 } 00384 } 00385 } 00386 00387 /****************************************************************/ 00388 /****************** done ****************************************/ 00389 00390 int BestSubsets::done() const 00391 { 00392 return _done; 00393 } 00394 00395 /****************************************************************/ 00396 /****************** done ****************************************/ 00397 00398 float BestSubsets::getProgress() const 00399 { 00400 if (done()) 00401 { return 1.0; } 00402 00403 else 00404 { 00405 float progByTotalRuns = 0.0; 00406 float progByHardOmission = 0.0; 00407 00408 float avgProgressActiveRuns = 0.0; 00409 for (int i = 0; i < _numActiveRuns; i++) 00410 { avgProgressActiveRuns += _activeRun[i]->getProgress(); } 00411 avgProgressActiveRuns /= _numActiveRuns; 00412 00413 progByTotalRuns = (_numFinishedRuns + avgProgressActiveRuns) / (float) _totalRuns; 00414 00415 if (!_softOmissionThreshold) 00416 { 00417 progByHardOmission = (_currentModelsUnderOmissionThreshold / 00418 (float) _modelsUnderOmission); 00419 } 00420 00421 float progress = (progByTotalRuns > progByHardOmission)? progByTotalRuns : progByHardOmission; 00422 00423 if (progress > _maxProgress) 00424 { _maxProgress = progress; } 00425 00426 return _maxProgress; 00427 } 00428 } 00429 00430 /****************************************************************/ 00431 /****************** getValue ************************************/ 00432 00433 Scalar BestSubsets::getValue( const Sample& x ) const 00434 { 00435 int i; 00436 double sum = 0.0; 00437 00438 if (_done) 00439 { 00440 for (i = 0; i < _numBestRuns; i++) 00441 { sum += _bestRun[i]->getValue(x); } 00442 } 00443 00444 return sum / (double) _numBestRuns; 00445 } 00446 00447 /****************************************************************/ 00448 /****************** getConvergence ******************************/ 00449 00450 int BestSubsets::getConvergence( Scalar * const val ) const 00451 { 00452 *val = 0; 00453 return 0; 00454 } 00455 00456 /****************************************************************/ 00457 /****************** getConfiguration ****************************/ 00458 void BestSubsets::_getConfiguration( ConfigurationPtr& config ) const 00459 { 00460 if ( !_done ) { 00461 00462 return; 00463 } 00464 00465 ConfigurationPtr model_config( new ConfigurationImpl("BestSubsets") ); 00466 config->addSubsection( model_config ); 00467 00468 model_config->addNameValue("Count", _numBestRuns); 00469 00470 for ( int i=0; i < _numBestRuns; i++ ) { 00471 00472 ConfigurationPtr child_config( new ConfigurationImpl("Run") ); 00473 child_config->addNameValue( "Id", i ); 00474 child_config->addNameValue( "OmissionError", _bestRun[i]->getOmission() * 100 ); 00475 child_config->addNameValue( "CommissionError", _bestRun[i]->getCommission() * 100 ); 00476 00477 ConfigurationPtr alg_config = _bestRun[i]->getAlgorithm()->getConfiguration(); 00478 child_config->addSubsection( alg_config ); 00479 00480 model_config->addSubsection( child_config ); 00481 } 00482 } 00483 00484 /***************************************************************/ 00485 /****************** setConfiguration ***************************/ 00486 void BestSubsets::_setConfiguration( const ConstConfigurationPtr& config ) 00487 { 00488 ConstConfigurationPtr model_config = config->getSubsection( "BestSubsets", false ); 00489 00490 if ( ! model_config ) { 00491 00492 return; 00493 } 00494 00495 _done = true; 00496 00497 _numBestRuns = model_config->getAttributeAsInt( "Count", 0 ); 00498 00499 _bestRun = new AlgorithmRun*[_numBestRuns]; 00500 00501 Configuration::subsection_list runs = model_config->getAllSubsections(); 00502 00503 Configuration::subsection_list::const_iterator fin = runs.end(); 00504 Configuration::subsection_list::const_iterator it = runs.begin(); 00505 // The index i is used to populate the _bestRuns array it is incremented after each 00506 // new algorithm is found. 00507 int i; 00508 for ( i = 0; it != fin; ++it ) { 00509 00510 // Test this here rather than at the bottom of loop. 00511 // This needs to be done after checking for loop terminal condition. 00512 if ( i == _numBestRuns ) { 00513 00514 throw ConfigurationException( "Number of deserialized algorithms exceeds Count" ); 00515 } 00516 00517 ConstConfigurationPtr run_config = *it; 00518 00519 if ( run_config->getName() != "Run" ) { 00520 00521 continue; 00522 } 00523 00524 AlgorithmPtr alg = AlgorithmFactory::newAlgorithm( run_config->getSubsection("Algorithm") ); 00525 00526 _bestRun[i] = new AlgorithmRun( alg ); 00527 00528 // increment i after adding algorithmRun to _bestRun 00529 ++i; 00530 } 00531 00532 if ( i < _numBestRuns ) { 00533 00534 throw ConfigurationException( "Number of deserialized algorithms is smaller than Count" ); 00535 } 00536 }