// openModeller
// Version 1.4.0
00001 00029 #ifdef WIN32 00030 // avoid warnings caused by problems in VC headers 00031 #define _SCL_SECURE_NO_DEPRECATE 00032 #endif 00033 00034 #include <string> 00035 using std::string; 00036 00037 #include "AbstractBestSubsets.hh" 00038 #include <openmodeller/Exceptions.hh> 00039 00040 #include <math.h> // for function ceil() 00041 00042 #ifdef WIN32 00043 #include <windows.h> 00044 #define SLEEP(secs) Sleep(secs * 1000) 00045 #else 00046 #include <unistd.h> 00047 #define SLEEP(secs) sleep(secs); 00048 #endif 00049 00050 /****************************************************************/ 00051 static void printListOfRuns(string msg, AlgorithmRun ** runs, int numOfRuns) 00052 { 00053 printf("%s\n", msg.c_str()); 00054 for (int i = 0; i < numOfRuns; i++) 00055 printf("%4d] om=%5.3f comm=%5.3f (id=%d)\n", i, 00056 runs[i]->getOmission(), 00057 runs[i]->getCommission(), 00058 runs[i]->getId()); 00059 } 00060 00061 /****************************************************************/ 00062 /****************** Garp class **********************************/ 00063 00064 AbstractBestSubsets::AbstractBestSubsets( AlgMetadata const * metadata_bs) : 00065 AlgorithmImpl( metadata_bs ) 00066 { 00067 _trainProp = 0.0; 00068 _totalRuns = 0; 00069 _omissionThreshold = 0.0; 00070 _modelsUnderOmission = 0; 00071 _commissionThreshold = 0.0; 00072 _commissionSampleSize = 0; 00073 _maxThreads = 0; 00074 00075 _softOmissionThreshold = false; 00076 _currentModelsUnderOmissionThreshold = 0; 00077 00078 _finishedRun = NULL; 00079 _activeRun = NULL; 00080 _bestRun = NULL; 00081 00082 _numFinishedRuns = 0; 00083 _numActiveRuns = 0; 00084 _done = false; 00085 00086 _maxProgress = 0.0; 00087 00088 } 00089 00090 // **************************************************************** 00091 AbstractBestSubsets::~AbstractBestSubsets() 00092 { 00093 int i; 00094 00095 if (_finishedRun) 00096 { 00097 for (i = 0; i < _numFinishedRuns; i++) 00098 { delete _finishedRun[i]; } 00099 delete[] _finishedRun; 
00100 } 00101 00102 if (_activeRun) 00103 { delete[] _activeRun; } 00104 00105 // bestRun just point to objects referenced by _finishedRun object 00106 if (_bestRun) 00107 { delete[] _bestRun; } 00108 00109 } 00110 00111 /****************************************************************/ 00112 /****************** configuration *******************************/ 00113 void 00114 AbstractBestSubsets::_getConfiguration( ConfigurationPtr& config ) const 00115 { 00116 00117 if ( !_done ) 00118 return; 00119 00120 ConfigurationPtr model_config( new ConfigurationImpl("BestSubsets") ); 00121 config->addSubsection( model_config ); 00122 00123 model_config->addNameValue("Count", _numBestRuns); 00124 00125 for( int i=0; i<_numBestRuns; i++ ) { 00126 ConfigurationPtr child_config( new ConfigurationImpl("Run" ) ); 00127 child_config->addNameValue( "Id", i ); 00128 child_config->addNameValue( "OmissionError", _bestRun[i]->getOmission() * 100 ); 00129 child_config->addNameValue( "CommissionError", _bestRun[i]->getCommission() * 100 ); 00130 00131 ConfigurationPtr alg_config = _bestRun[i]->getAlgorithm()->getConfiguration(); 00132 child_config->addSubsection( alg_config ); 00133 00134 model_config->addSubsection( child_config ); 00135 00136 } 00137 00138 } 00139 00140 void 00141 AbstractBestSubsets::_setConfiguration( const ConstConfigurationPtr& config ) 00142 { 00143 00144 ConstConfigurationPtr model_config = config->getSubsection("BestSubsets",false); 00145 if (!model_config) 00146 return; 00147 00148 _done = true; 00149 00150 _numBestRuns = model_config->getAttributeAsInt( "Count", 0 ); 00151 00152 _bestRun = new AlgorithmRun*[_numBestRuns]; 00153 00154 Configuration::subsection_list runs = model_config->getAllSubsections(); 00155 00156 Configuration::subsection_list::const_iterator fin = runs.end(); 00157 Configuration::subsection_list::const_iterator it = runs.begin(); 00158 // The index i is used to populate the _bestRuns array it is incremented after each 00159 // new 
algorithm is found. 00160 int i; 00161 for( i = 0; it != fin; ++it ) { 00162 00163 // Test this here rather than at the bottom of loop. 00164 // This needs to be done after checking for loop terminal condition. 00165 if ( i == _numBestRuns ) { 00166 throw ConfigurationException( "Number of deserialized algorithms exceeds Count" ); 00167 } 00168 00169 ConstConfigurationPtr run_config = *it; 00170 00171 if ( run_config->getName() != "Run" ) { 00172 continue; 00173 } 00174 00175 AlgorithmPtr alg = AlgorithmFactory::newAlgorithm( run_config->getSubsection("Algorithm") ); 00176 00177 _bestRun[i] = new AlgorithmRun( alg ); 00178 00179 // increment i after adding algorithmRun to _bestRun 00180 ++i; 00181 } 00182 00183 if ( i < _numBestRuns ) { 00184 throw ConfigurationException( "Number of deserialized algorithms is smaller than Count" ); 00185 } 00186 } 00187 00188 // **************************************************************** 00189 // ************* needNormalization ******************************** 00190 00191 int AbstractBestSubsets::needNormalization() 00192 { 00193 // This is a hack. needNormalization is called before initialize. 
00194 AbstractBestSubsets *non_const = const_cast<AbstractBestSubsets*>(this); 00195 if ( _subAlgorithm.empty() ) { 00196 if ( !non_const->getParameter("SubAlgorithm", & _subAlgorithm ) ) { 00197 std::string error = "Parameter SubAlgorithm not set properly."; 00198 Log::instance()->error( error.c_str() ); 00199 throw AlgorithmException( error ); 00200 } 00201 } 00202 00203 AlgorithmPtr alg = AlgorithmFactory::newAlgorithm( _subAlgorithm ); 00204 00205 if ( alg->needNormalization() ) { 00206 00207 Log::instance()->info( "Computing normalization in best subsets\n"); 00208 00209 // Compute normalization here to avoid computing it again in each GARP run 00210 // note: getNormalizer will return a copy 00211 _normalizerPtr = alg->getNormalizer(); 00212 00213 if ( _normalizerPtr ) { 00214 00215 _normalizerPtr->computeNormalization( _samp ); 00216 00217 setNormalization( _samp ); 00218 } 00219 } 00220 00221 // No need to normalize again 00222 return 0; 00223 } 00224 00225 // **************************************************************** 00226 // ************* initialize *************************************** 00227 00228 int AbstractBestSubsets::initialize() 00229 { 00230 // BS parameters 00231 #if 0 00232 if (!getParameter("SubAlgorithm", &_subAlgorithm)) { 00233 Log::instance()->error("Parameter SubAlgorithm not set properly.\n"); 00234 return 0; 00235 } 00236 #endif 00237 00238 if (!getParameter("TrainingProportion", &_trainProp)) { 00239 Log::instance()->error("Parameter TrainingProportion not set properly.\n"); 00240 return 0; 00241 } 00242 00243 if (!getParameter("TotalRuns", &_totalRuns)) { 00244 Log::instance()->error("Parameter TotalRuns not set properly.\n"); 00245 return 0; 00246 } 00247 00248 if (!getParameter("HardOmissionThreshold", &_omissionThreshold)) { 00249 Log::instance()->error("Parameter HardOmissionThreshold not set properly.\n"); 00250 return 0; 00251 } 00252 00253 if (!getParameter("ModelsUnderOmissionThreshold", &_modelsUnderOmission)) { 00254 
Log::instance()->error("Parameter ModelsUnderOmissionThreshold not set properly.\n"); 00255 return 0; 00256 } 00257 00258 if (!getParameter("CommissionThreshold", &_commissionThreshold)) { 00259 Log::instance()->error("Parameter CommissionThreshold not set properly.\n"); 00260 return 0; 00261 } 00262 00263 if (!getParameter("CommissionSampleSize", &_commissionSampleSize)) { 00264 Log::instance()->error("Parameter CommissionSampleSize not set properly.\n"); 00265 return 0; 00266 } 00267 00268 if (!getParameter("MaxThreads", &_maxThreads)) { 00269 Log::instance()->error("Parameter MaxThreads not set properly.\n"); 00270 return 0; 00271 } 00272 00273 if ( _maxThreads < 1 ) 00274 { 00275 _maxThreads = 1; 00276 } 00277 else if ( _maxThreads > 1 ) 00278 { 00279 // When maxThreads is greater than 1, if the machine has only one processor om 00280 // can crash. If the machine has more than one processor GDAL can output lots 00281 // of IO errors (current GDAL version does not seem to be thread safe). 00282 Log::instance()->warn("Multithreading is still experimental. When max threads is greater than 1, depending on software and hardware configuration this application may crash or you may see lots of raster IO warnings. In these cases, we recommend you to set this parameter to 1.\n"); 00283 } 00284 00285 if (_trainProp <= 1.0) 00286 { 00287 Log::instance()->warn("The specified training proportion value is less than or equals 1. Please note that there was a change in the valid range for this parameter from 0-1 to 0-100. 
Small values may result in zero presence points being used to train the model.\n"); 00288 } 00289 00290 // convert percentages (100%) to proportions (1.0) for external parameters 00291 _trainProp /= 100.0; 00292 _commissionThreshold /= 100.0; 00293 _omissionThreshold /= 100.0; 00294 00295 _softOmissionThreshold = (_omissionThreshold >= 1.0); 00296 if (_modelsUnderOmission > _totalRuns) 00297 { 00298 Log::instance()->warn("ModelsUnderOmission (%d) is greater than the number of runs (%d). ModelsUnderOmission will be reduced to (%d)\n", _modelsUnderOmission, _totalRuns, _totalRuns); 00299 _modelsUnderOmission = _totalRuns; 00300 } 00301 00302 _finishedRun = new AlgorithmRun*[_totalRuns]; 00303 _activeRun = new AlgorithmRun*[_maxThreads]; 00304 00305 return 1; 00306 } 00307 00308 /****************************************************************/ 00309 /****************** iterate *************************************/ 00310 00311 int AbstractBestSubsets::iterate() 00312 { 00313 static int iterations = 0; 00314 static int runId = 0; 00315 int active; 00316 AlgorithmRun * algRun; 00317 00318 ++iterations; 00319 00320 if (_done) 00321 { return 1; } 00322 00323 // check if it should start new runs 00324 if ((_numFinishedRuns + _numActiveRuns < _totalRuns) && 00325 !earlyTerminationConditionMet()) 00326 { 00327 // it needs to start more runs 00328 // wait for a slot for a new thread 00329 if ((active = numActiveThreads()) >= _maxThreads) 00330 { 00331 //Log::instance()->info("%5d] Waiting for a slot to run next thread (%d out of %d)\n", iterations, active, _maxThreads); 00332 SLEEP(2); 00333 } 00334 00335 else 00336 { 00337 //Log::instance()->debug("%5d] There is an empty slot to run next thread (%d out of %d) - %d\n", iterations, active, _maxThreads, runId); 00338 00339 // start new Algorithm 00340 SamplerPtr train, test; 00341 splitSampler(_samp, &train, &test, _trainProp); 00342 00343 //printf("Presences: Orig=%d, train=%d, test=%d\n", _samp->numPresence(), 
train->numPresence(), test->numPresence()); 00344 //printf("Absences: Orig=%d, train=%d, test=%d\n", _samp->numAbsence(), train->numAbsence(), test->numAbsence()); 00345 00346 AlgorithmPtr algo = AlgorithmFactory::newAlgorithm( _subAlgorithm ); 00347 algo->setParameters( _param ); 00348 algo->setSampler(train); 00349 algo->initialize(); 00350 algRun = new AlgorithmRun(algo); 00351 algRun->initialize(runId++, 00352 _commissionSampleSize, train, test ); 00353 _activeRun[_numActiveRuns++] = algRun; 00354 algRun->run(); 00355 } 00356 } 00357 00358 else 00359 { 00360 // no more runs are needed 00361 // check if all active threads have finished 00362 active = numActiveThreads(); 00363 if (active) 00364 { 00365 // there are still threads running 00366 // Log::instance()->info("%5d] Waiting for %d active thread(s) to finish.\n", iterations, active); 00367 SLEEP(2); 00368 } 00369 else 00370 { 00371 // all running threads terminated 00372 // calculate best subset and exit 00373 //Log::instance()->info("%5d] Calculating best and terminating algorithm.\n", iterations); 00374 calculateBestSubset(); 00375 _done = true; 00376 } 00377 } 00378 00379 return 1; 00380 } 00381 00382 /****************************************************************/ 00383 int AbstractBestSubsets::numActiveThreads() 00384 { 00385 int i; 00386 AlgorithmRun * run; 00387 00388 for (i = 0; i < _numActiveRuns; i++) 00389 { 00390 run = _activeRun[i]; 00391 00392 if (!run->running()) 00393 { 00394 //Log::instance()->info("Thread %d has just finished.\n", run->getId()); 00395 00396 // run finished its work 00397 // move it to finished runs 00398 // and remove it from list of active runs 00399 _finishedRun[_numFinishedRuns++] = run; 00400 _activeRun[i] = _activeRun[--_numActiveRuns]; 00401 _activeRun[_numActiveRuns] = NULL; 00402 00403 // update count of models under omission threshold 00404 if (!_softOmissionThreshold) 00405 { 00406 if (run->getOmission() <= _omissionThreshold) 00407 { 
_currentModelsUnderOmissionThreshold++; } 00408 } 00409 } 00410 } 00411 00412 return _numActiveRuns; 00413 } 00414 00415 /****************************************************************/ 00416 int AbstractBestSubsets::earlyTerminationConditionMet() 00417 { 00418 return (!_softOmissionThreshold) && 00419 (_currentModelsUnderOmissionThreshold >= _modelsUnderOmission); 00420 } 00421 00422 /****************************************************************/ 00423 int AbstractBestSubsets::calculateBestSubset() 00424 { 00425 int i; 00426 00427 Log::instance()->info("Calculating best subset of models.\n"); 00428 00429 // make a copy of finished runs to play with 00430 AlgorithmRun ** runList = new AlgorithmRun*[_numFinishedRuns]; 00431 for (i = 0; i < _numFinishedRuns; i++) 00432 { runList[i] = _finishedRun[i]; } 00433 00434 printListOfRuns("Finished Runs:", runList, _numFinishedRuns); 00435 00436 // get list of models that pass omission test 00437 // sort runs by omission 00438 // first <_modelsUnderOmission> runs are the selected ones 00439 sortRuns(runList, _numFinishedRuns, 0); 00440 00441 printListOfRuns("Finished Runs by Omission:", runList, _numFinishedRuns); 00442 00443 // get list of models that pass commission test 00444 sortRuns(runList, _modelsUnderOmission, 1); 00445 00446 printListOfRuns("Best Omission Runs by Commission:", runList, _numFinishedRuns); 00447 00448 _numBestRuns = (int)( _commissionThreshold * (double)_modelsUnderOmission + 0.5 ); 00449 int medianRun = _modelsUnderOmission / 2; 00450 int firstRun = (int)ceil( (double) medianRun - (double)_numBestRuns / 2.0 ); 00451 00452 _bestRun = new AlgorithmRun*[_numBestRuns]; 00453 00454 for (i = 0; i < _numBestRuns; i++) 00455 { _bestRun[i] = runList[i + firstRun]; } 00456 00457 printListOfRuns("Best Runs:", _bestRun, _numBestRuns); 00458 00459 printf("Median: %d First: %d\n", medianRun, firstRun); 00460 00461 00462 delete[] runList; 00463 00464 Log::instance()->info("Selected best %d models out of %d.\n", 
_numBestRuns, _totalRuns); 00465 00466 return 1; 00467 } 00468 00469 /****************************************************************/ 00470 void AbstractBestSubsets::sortRuns(AlgorithmRun ** runList, 00471 int nelements, int errorType) 00472 { 00473 int i, j; 00474 AlgorithmRun * runJ0, * runJ1; 00475 00476 //Log::instance()->info("Sorting list %x of %d elements by index %d.\n", runList, nelements, errorType); 00477 00478 // bubble sort 00479 // TODO: change to quicksort if this becomes a bottleneck 00480 for (i = 0; i < nelements - 1; i++) 00481 { 00482 for (j = 0; j < nelements - i - 1; j++) 00483 { 00484 runJ0 = runList[j]; 00485 runJ1 = runList[j + 1]; 00486 00487 if (runJ0->getError(errorType) > runJ1->getError(errorType)) 00488 { 00489 // exchange elements j and j + 1 00490 runList[j] = runJ1; 00491 runList[j + 1] = runJ0; 00492 } 00493 } 00494 } 00495 } 00496 00497 /****************************************************************/ 00498 /****************** done ****************************************/ 00499 00500 int AbstractBestSubsets::done() const 00501 { 00502 return _done; 00503 } 00504 00505 /****************************************************************/ 00506 /****************** done ****************************************/ 00507 00508 float AbstractBestSubsets::getProgress() const 00509 { 00510 if (done()) 00511 { return 1.0; } 00512 00513 else 00514 { 00515 float progByTotalRuns = 0.0; 00516 float progByHardOmission = 0.0; 00517 00518 float avgProgressActiveRuns = 0.0; 00519 for (int i = 0; i < _numActiveRuns; i++) 00520 { avgProgressActiveRuns += _activeRun[i]->getProgress(); } 00521 avgProgressActiveRuns /= _numActiveRuns; 00522 00523 progByTotalRuns = (_numFinishedRuns + avgProgressActiveRuns) / (float) _totalRuns; 00524 00525 if (!_softOmissionThreshold) 00526 { 00527 progByHardOmission = (_currentModelsUnderOmissionThreshold / 00528 (float) _modelsUnderOmission); 00529 } 00530 00531 float progress = (progByTotalRuns > 
progByHardOmission)? progByTotalRuns : progByHardOmission; 00532 00533 if (progress > _maxProgress) 00534 { _maxProgress = progress; } 00535 00536 return _maxProgress; 00537 } 00538 } 00539 00540 /****************************************************************/ 00541 /****************** getValue ************************************/ 00542 00543 Scalar AbstractBestSubsets::getValue( const Sample& x ) const 00544 { 00545 int i; 00546 double sum = 0.0; 00547 00548 if (_done) 00549 { 00550 for (i = 0; i < _numBestRuns; i++) 00551 { sum += _bestRun[i]->getValue(x); } 00552 } 00553 00554 return sum / (double) _numBestRuns; 00555 } 00556 00557 /****************************************************************/ 00558 /****************** getConvergence ******************************/ 00559 00560 int AbstractBestSubsets::getConvergence( Scalar * const val ) const 00561 { 00562 *val = 0; 00563 return 0; 00564 } 00565