openModeller
Version 1.4.0
|
00001 00032 #include <stdio.h> 00033 00034 #include <openmodeller/RocCurve.hh> 00035 #include <openmodeller/Sampler.hh> 00036 #include <openmodeller/Algorithm.hh> 00037 #include <openmodeller/Occurrences.hh> 00038 #include <openmodeller/Environment.hh> 00039 #include <openmodeller/Configuration.hh> 00040 #include <openmodeller/Log.hh> 00041 00042 #include <openmodeller/Exceptions.hh> 00043 00044 #include <string.h> 00045 #include <algorithm> 00046 #include <map> 00047 00048 #include <math.h> 00049 #ifdef MSVC 00050 #include <float.h> //for _isnan 00051 #endif 00052 00053 using namespace std; 00054 00055 /*******************/ 00056 /*** constructor ***/ 00057 RocCurve::RocCurve() 00058 { 00059 initialize(); 00060 } 00061 00062 /*******************/ 00063 /*** destructor ***/ 00064 RocCurve::~RocCurve() 00065 { 00066 } 00067 00068 /******************/ 00069 /*** initialize ***/ 00070 void RocCurve::initialize( int resolution ) 00071 { 00072 _resolution = resolution; 00073 _approach = 0; // to be set later 00074 _num_background_points = ROC_DEFAULT_BACKGROUND_POINTS; // when no absences are provided 00075 _use_absences_as_background = false; 00076 } 00077 00078 /******************/ 00079 /*** initialize ***/ 00080 void RocCurve::initialize( int resolution, int num_background_points ) 00081 { 00082 _resolution = resolution; 00083 _approach = 2; 00084 _num_background_points = num_background_points; 00085 _use_absences_as_background = false; 00086 } 00087 00088 /******************/ 00089 /*** initialize ***/ 00090 void RocCurve::initialize( int resolution, bool use_absences_as_background ) 00091 { 00092 _resolution = resolution; 00093 _approach = 2; 00094 _num_background_points = 0; // to be set later (= number of absence points) 00095 _use_absences_as_background = use_absences_as_background; 00096 } 00097 00098 /*************/ 00099 /*** reset ***/ 00100 void RocCurve::reset() 00101 { 00102 Log::instance()->debug( "Resetting ROC curve\n" ); 00103 00104 _ready = false; 00105 _category.erase( _category.begin(), _category.end() ); 00106 _prediction.erase( _prediction.begin(), _prediction.end() ); 00107 _data.erase( _data.begin(), _data.end() ); 00108 _true_negatives = 0; 00109 _true_positives = 0; 00110 _auc = -1.0; 00111 00112 _proportions.erase( _proportions.begin(), _proportions.end() ); 00113 _proportions.reserve( _resolution ); 00114 00115 _thresholds.erase( _thresholds.begin(), _thresholds.end() ); 00116 _thresholds.reserve( _resolution ); 00117 00118 // Compute thresholds 00119 for ( int i = 0; i < _resolution; i++ ) { 00120 00121 _thresholds.push_back( Scalar(i) / ( _resolution - 1 ) ); 00122 _proportions.push_back( 0 ); 00123 } 00124 00125 _ratios.clear(); 00126 } 00127 00128 00129 /*****************/ 00130 /*** calculate ***/ 00131 void RocCurve::calculate( const Model& model, const SamplerPtr& sampler ) 00132 { 00133 Log::instance()->info( "Calculating ROC curve\n" ); 00134 00135 reset(); 00136 00137 model->setNormalization( sampler ); 00138 00139 _loadPredictions( model, sampler ); 00140 00141 _calculateGraphPoints(); 00142 00143 _ready = true; 00144 } 00145 00146 00147 /************************/ 00148 /*** load Predictions ***/ 00149 void RocCurve::_loadPredictions( const Model& model, const SamplerPtr& sampler ) 00150 { 00151 // Check parameters 00152 00153 EnvironmentPtr env = sampler->getEnvironment(); 00154 00155 if ( ! env ) { 00156 00157 std::string msg = "No environment specified for the ROC curve\n"; 00158 00159 Log::instance()->error( msg.c_str() ); 00160 throw InvalidParameterException( msg ); 00161 } 00162 00163 OccurrencesPtr presences = sampler->getPresences(); 00164 OccurrencesPtr absences = sampler->getAbsences(); 00165 00166 if ( ! presences ) { 00167 00168 std::string msg = "No presence points specified for the ROC curve\n"; 00169 00170 Log::instance()->error( msg.c_str() ); 00171 throw InvalidParameterException( msg ); 00172 } 00173 00174 // Set approach and check parameters 00175 00176 int size = presences->numOccurrences(); 00177 00178 if ( absences && ! absences->isEmpty() ) { 00179 00180 if ( _approach == 0 ) { // undefined 00181 00182 _approach = 1; // traditional approach 00183 } 00184 00185 int num_absences = absences->numOccurrences(); 00186 00187 if ( _approach == 1 ) { // traditional approach 00188 00189 size += num_absences; 00190 } 00191 else { 00192 00193 if ( _use_absences_as_background ) { 00194 00195 if ( absences->isEmpty() ) { 00196 00197 std::string msg = "Cannot use absences as background points for the ROC curve when no absences are provided\n"; 00198 00199 Log::instance()->error( msg.c_str() ); 00200 throw InvalidParameterException( msg ); 00201 } 00202 00203 _num_background_points = num_absences; 00204 } 00205 else { 00206 00207 Log::instance()->info( "Ignoring absences for the ROC curve\n" ); 00208 } 00209 } 00210 } 00211 else { 00212 00213 // No absences provided 00214 00215 if ( _approach == 0 ) { // undefined 00216 00217 _approach = 2; // proportional area 00218 } 00219 else if ( _approach == 1 ) { // traditional approach 00220 00221 std::string msg = "Cannot calculate traditional ROC curve when no absences are provided\n"; 00222 00223 Log::instance()->error( msg.c_str() ); 00224 throw InvalidParameterException( msg ); 00225 } 00226 00227 if ( _use_absences_as_background ) { 00228 00229 std::string msg = "Cannot use absences as background points for the ROC curve when no absences are provided\n"; 00230 00231 Log::instance()->error( msg.c_str() ); 00232 throw InvalidParameterException( msg ); 00233 } 00234 } 00235 00236 // Load predictions 00237 00238 _category.reserve( size ); 00239 _prediction.reserve( size ); 00240 00241 OccurrencesImpl::const_iterator it = presences->begin(); 00242 OccurrencesImpl::const_iterator fin = presences->end(); 00243 00244 Scalar predictionValue; 00245 00246 // Store predictions for presence points 00247 int i = 0; 00248 while( it != fin ) { 00249 00250 Sample sample; 00251 00252 if ( (*it)->hasEnvironment() ) { 00253 00254 sample = (*it)->environment(); 00255 } 00256 else { 00257 00258 sample = env->get( (*it)->x(), (*it)->y() ); 00259 } 00260 00261 if ( sample.size() > 0 ) { 00262 00263 ++i; 00264 00265 predictionValue = model->getValue( sample ); 00266 00267 _category.push_back( 1 ); 00268 _prediction.push_back( predictionValue ); 00269 00270 Log::instance()->debug( "Probability for presence point %s (%f,%f): %f\n", 00271 ((*it)->id()).c_str(), (*it)->x(), (*it)->y(), predictionValue ); 00272 } 00273 else { 00274 00275 Log::instance()->warn( "Skipping point (%s) with no environmental data!\n", 00276 ((*it)->id()).c_str() ); 00277 } 00278 00279 ++it; 00280 } 00281 00282 // Store predictions for absence points 00283 i = 0; 00284 if ( _approach == 1 ) { 00285 00286 it = absences->begin(); 00287 fin = absences->end(); 00288 00289 while( it != fin ) { 00290 00291 Sample sample; 00292 00293 if ( (*it)->hasEnvironment() ) { 00294 00295 sample = (*it)->environment(); 00296 } 00297 else { 00298 00299 sample = env->get( (*it)->x(), (*it)->y() ); 00300 } 00301 00302 if ( sample.size() > 0 ) { 00303 00304 ++i; 00305 00306 predictionValue = model->getValue( sample ); 00307 00308 _category.push_back( 0 ); 00309 _prediction.push_back( predictionValue ); 00310 00311 Log::instance()->debug( "Probability for absence point %s (%f,%f): %f\n", 00312 ((*it)->id()).c_str(), (*it)->x(), (*it)->y(), predictionValue ); 00313 } 00314 else { 00315 00316 Log::instance()->warn( "Skipping point (%s) with no environmental data!\n", 00317 ((*it)->id()).c_str() ); 00318 } 00319 00320 ++it; 00321 } 00322 } 00323 else { 00324 00325 if ( _use_absences_as_background ) { 00326 00327 Log::instance()->info( "Using %d absences as background for the ROC curve\n", _num_background_points ); 00328 00329 it = absences->begin(); 00330 fin = absences->end(); 00331 00332 while( it != fin ) { 00333 00334 Sample sample; 00335 00336 if ( (*it)->hasEnvironment() ) { 00337 00338 sample = (*it)->environment(); 00339 } 00340 else { 00341 00342 sample = env->get( (*it)->x(), (*it)->y() ); 00343 } 00344 00345 if ( sample.size() > 0 ) { 00346 00347 ++i; 00348 00349 predictionValue = model->getValue( sample ); 00350 00351 for ( unsigned int j = 0; j < _thresholds.size(); j++ ) { 00352 00353 if ( predictionValue < _thresholds[j] ) { 00354 00355 break; 00356 } 00357 00358 _proportions[j] += 1; 00359 } 00360 } 00361 00362 ++it; 00363 } 00364 } 00365 else { 00366 00367 // Generate background points 00368 00369 Log::instance()->info( "Generating %d background points\n", _num_background_points ); 00370 00371 Scalar prob; 00372 00373 int i = 0; 00374 00375 do { 00376 00377 Coord x,y; 00378 Sample s( env->getRandom( &x, &y ) ); 00379 00380 prob = model->getValue( s ); 00381 00382 for ( unsigned int j = 0; j < _thresholds.size(); j++ ) { 00383 00384 if ( prob < _thresholds[j] ) { 00385 00386 break; 00387 } 00388 00389 _proportions[j] += 1; 00390 } 00391 00392 ++i; 00393 00394 } while ( i < _num_background_points ); 00395 } 00396 } 00397 00398 if ( _approach == 2 ) { // proportional area 00399 00400 for ( unsigned int f = 0; f < _proportions.size(); f++ ) { 00401 00402 _proportions[f] /= _num_background_points; 00403 } 00404 } 00405 } 00406 00407 00408 /******************************/ 00409 /*** calculate Graph Points ***/ 00410 void RocCurve::_calculateGraphPoints() 00411 { 00412 int i, j, num_pairs = _prediction.size(); 00413 00414 if ( _approach == 2 ) { 00415 00416 Log::instance()->debug( "Using proportional area approach\n" ); 00417 } 00418 else { 00419 00420 Log::instance()->debug( "Using traditional ROC approach (presence x absence)\n" ); 00421 } 00422 00423 // Compute a specified number of data points for the graph 00424 for ( i = 0; i < _resolution; i++ ) { 00425 00426 // Define counters 00427 int num_tp = 0; // true positives 00428 int num_fp = 0; // false positives 00429 int num_tn = 0; // true negatives 00430 int num_fn = 0; // false negatives 00431 00432 // Positivity criterion for current point 00433 Scalar threshold = _thresholds[i]; 00434 00435 Log::instance()->debug( "Calculating ROC point for %f threshold\n", threshold ); 00436 00437 // Process all pairs 00438 for ( j = 0; j < num_pairs; j++ ) { 00439 00440 // Determine actual (binarized) and predicted value 00441 bool act_flag = ( _category[j] == 1 ); 00442 bool prd_flag = ( _prediction[j] >= threshold ); 00443 00444 // Increment counters 00445 if ( act_flag && prd_flag ) { 00446 00447 num_tp++; 00448 } 00449 else if ( !act_flag && !prd_flag ) { 00450 00451 num_tn++; 00452 } 00453 else if ( !act_flag && prd_flag ) { 00454 00455 num_fp++; 00456 } 00457 else { 00458 00459 num_fn++; 00460 } 00461 } 00462 00463 // Define counter sums 00464 int num_ap = num_tp + num_fn; // actual positives 00465 int num_pp = num_tp + num_fp; // predicted positives 00466 int num_an = num_tn + num_fp; // actual negatives 00467 int num_pn = num_tn + num_fn; // predicted negatives 00468 int num_tt = num_ap + num_an; // Total tally 00469 00470 _true_negatives = num_an; 00471 _true_positives = num_ap; 00472 00473 // Determine sensitivity, specificity, etc. 00474 Scalar sensitivity = (num_ap == 0) ? Scalar(-1) : Scalar(num_tp) / num_ap; 00475 Scalar specificity = (num_an == 0) ? Scalar(-1) : Scalar(num_tn) / num_an; 00476 Scalar ppvalue = (num_pp == 0) ? Scalar(-1) : Scalar(num_tp) / num_pp; 00477 Scalar npvalue = (num_pn == 0) ? Scalar(-1) : Scalar(num_tn) / num_pn; 00478 Scalar accuracy = (num_tt == 0) ? Scalar(-1) : Scalar(num_tp + num_tn) / num_tt; 00479 00480 // Ignore points with unknown sensitivity 00481 if ( sensitivity == -1 ) { 00482 00483 Log::instance()->debug( "Ignoring point with unknown sensitivity\n" ); 00484 continue; 00485 } 00486 00487 // Ignore points with unknown specificity if absence points were provided 00488 if ( specificity == -1 && _approach == 1 ) { 00489 00490 Log::instance()->debug( "Ignoring point with unknown specificity\n" ); 00491 continue; 00492 } 00493 00494 std::vector<Scalar> v; 00495 v.reserve(6); 00496 00497 // Store vector contents 00498 00499 // x value 00500 if ( _approach == 1 ) { 00501 00502 v.push_back(1 - specificity); 00503 Log::instance()->debug( "1 - specificity = %f\n", 1 - specificity ); 00504 } 00505 else { 00506 00507 v.push_back( _proportions[i] ); 00508 Log::instance()->debug( "Proportion = %f\n", _proportions[i] ); 00509 } 00510 00511 Log::instance()->debug( "Sensitivity = %f\n", sensitivity ); 00512 00513 // y value 00514 v.push_back(sensitivity); 00515 00516 v.push_back(ppvalue); 00517 v.push_back(npvalue); 00518 v.push_back(accuracy); 00519 v.push_back(threshold); 00520 00521 // Append to data vector 00522 _data.push_back(v); 00523 } 00524 00525 // Append (0, 0) artificially. 00526 std::vector<Scalar> v00; 00527 00528 v00.reserve(6); 00529 00530 v00.push_back(0.0); // 1 - specificity 00531 v00.push_back(0.0); // sensitivity 00532 v00.push_back(-1); // ppvalue 00533 v00.push_back(-1); // npvalue 00534 v00.push_back(-1); // accuracy 00535 v00.push_back(-1); // threshold 00536 00537 _data.push_back(v00); 00538 00539 if ( _approach == 1 ) { 00540 00541 // Append (1, 1) artificially. 00542 std::vector<Scalar> v11; 00543 00544 v11.reserve(6); 00545 00546 v11.push_back(1.0); // 1 - specificity 00547 v11.push_back(1.0); // sensitivity 00548 v11.push_back(-1); // ppvalue 00549 v11.push_back(-1); // npvalue 00550 v11.push_back(-1); // accuracy 00551 v11.push_back(-1); // threshold 00552 00553 _data.push_back(v11); 00554 } 00555 00556 VectorCompare compare; 00557 00558 // Sort ROC points. 00559 std::sort( _data.begin(), _data.end(), compare ); 00560 } 00561 00562 00563 /**********************/ 00564 /*** calculate Area ***/ 00565 bool RocCurve::_calculateTotalArea() 00566 { 00567 _auc = -1.0; 00568 00569 int i, num_points = numPoints(); 00570 00571 // Verify dimensions 00572 if ( num_points < 2 ) { 00573 00574 return false; 00575 } 00576 00577 double area = 0.0; 00578 00579 // Approximate area under ROC curve with trapezes 00580 for ( i = 1; i < num_points; i++ ) { 00581 00582 double x1 = getX(i - 1); 00583 double y1 = getY(i - 1); 00584 double x2 = getX(i); 00585 double y2 = getY(i); 00586 00587 if ( x2 != x1 ) { 00588 00589 area += (x2 - x1) * 0.5 * (y1 + y2); 00590 } 00591 } 00592 00593 _auc = area; 00594 00595 return true; 00596 } 00597 00598 00599 /**********************/ 00600 /*** get Total Area ***/ 00601 double 00602 RocCurve::getTotalArea() { 00603 00604 if ( _auc < 0.0 ) { 00605 00606 _calculateTotalArea(); 00607 } 00608 00609 return _auc; 00610 } 00611 00612 00613 /******************************/ 00614 /*** get Partial Area Ratio ***/ 00615 double RocCurve::getPartialAreaRatio( double e ) 00616 { 00617 Log::instance()->info( "Calculating partial area for limit %f\n", e ); 00618 00619 if ( e < 0.0 ) { 00620 00621 e = 0.0; 00622 } 00623 00624 if ( e > 1.0 ) { 00625 00626 e = 1.0; 00627 } 00628 00629 double area = 0.0; 00630 00631 double diag_area = 0.0; 00632 00633 int i, num_points = numPoints(); 00634 00635 // Verify dimensions 00636 if ( num_points < 2 ) { 00637 00638 return -1.0; 00639 } 00640 00641 map<double, double>::iterator it; 00642 00643 it = _ratios.find( e ); 00644 00645 if ( it != _ratios.end() ) { 00646 00647 return _ratios[e]; 00648 } 00649 00650 bool interpolate = true; 00651 00652 // Approximate area under ROC curve with trapezes 00653 for ( i = 1; i < num_points; i++ ) { 00654 00655 double x1 = getX(i - 1); 00656 double y1 = getY(i - 1); 00657 double x2 = getX(i); 00658 double y2 = getY(i); 00659 00660 // Only points where Y is greater than or equals 1-e (e=maximum accepted omission) 00661 if ( x2 != x1 ) { 00662 00663 if ( y1 == (1.0 - e) ) { 00664 00665 area += (x2 - x1) * 0.5 * (y1 + y2); 00666 00667 diag_area += (x2 - x1) * 0.5 * (x1 + x2); 00668 00669 interpolate = false; 00670 } 00671 else if ( y1 > (1.0 - e) ) { // y1 is in ascending order 00672 00673 if ( interpolate ) { 00674 00675 if ( i > 1 ) { 00676 00677 double x0 = getX(i - 2); 00678 double y0 = getY(i - 2); 00679 00680 double y = 1.0 - e; 00681 00682 double x; 00683 00684 if ( y1 == y0 ) { 00685 00686 x = x1 - x0; 00687 } 00688 else { 00689 00690 x = x1 - ((x1-x0)*(y1-y)/(y1-y0)); 00691 } 00692 00693 // Add missing previous area via interpolation 00694 area += (x1 - x) * 0.5 * (y + y1); 00695 00696 diag_area += (x1 - x) * 0.5 * (x + x1); 00697 00698 // Normal trapezoid 00699 area += (x2 - x1) * 0.5 * (y1 + y2); 00700 00701 diag_area += (x2 - x1) * 0.5 * (x1 + x2); 00702 00703 interpolate = false; 00704 } 00705 } 00706 else { 00707 00708 area += (x2 - x1) * 0.5 * (y1 + y2); 00709 00710 diag_area += (x2 - x1) * 0.5 * (x1 + x2); 00711 } 00712 } 00713 } 00714 } 00715 00716 Log::instance()->debug( "Partial area calculated as: %f / %f\n", area, diag_area ); 00717 00718 double ratio = area / diag_area; 00719 00720 #ifdef MSVC 00721 bool ratio_isnan = (_isnan(ratio) == 1) ? true : false; 00722 #else 00723 bool ratio_isnan = isnan(ratio); 00724 #endif 00725 if ( ratio_isnan ) { 00726 00727 ratio = 0.0; 00728 } 00729 00730 _ratios[e] = ratio; 00731 00732 return _ratios[e]; 00733 } 00734 00735 00736 /*************************/ 00737 /*** get Configuration ***/ 00738 ConfigurationPtr 00739 RocCurve::getConfiguration() const 00740 { 00741 ConfigurationPtr config( new ConfigurationImpl("RocCurve") ); 00742 00743 config->addNameValue( "Auc", _auc ); 00744 00745 if ( _approach == 2 ) { 00746 00747 config->addNameValue( "NumBackgroundPoints", _num_background_points ); 00748 } 00749 00750 int num_points = numPoints(); 00751 00752 double *tmp_points = new double[num_points*2]; 00753 00754 int cnt = 0; 00755 00756 for ( int i = 0; i < num_points; ++i, ++cnt ) { 00757 00758 tmp_points[cnt] = getX( i ); 00759 00760 ++cnt; 00761 00762 tmp_points[cnt] = getY( i ); 00763 } 00764 00765 config->addNameValue( "Points", tmp_points, num_points*2 ); 00766 00767 for ( map<double, double>::const_iterator it = _ratios.begin(); it != _ratios.end(); it++ ) { 00768 00769 ConfigurationPtr ratio( new ConfigurationImpl( "Ratio" ) ); 00770 00771 ratio->addNameValue( "E", (*it).first ); 00772 ratio->addNameValue( "Value", (*it).second ); 00773 00774 config->addSubsection( ratio ); 00775 } 00776 00777 delete[] tmp_points; 00778 00779 return config; 00780 }