openModeller  Version 1.4.0
RocCurve.cpp
Go to the documentation of this file.
00001 
00032 #include <stdio.h>
00033 
00034 #include <openmodeller/RocCurve.hh>
00035 #include <openmodeller/Sampler.hh>
00036 #include <openmodeller/Algorithm.hh>
00037 #include <openmodeller/Occurrences.hh>
00038 #include <openmodeller/Environment.hh>
00039 #include <openmodeller/Configuration.hh>
00040 #include <openmodeller/Log.hh>
00041 
00042 #include <openmodeller/Exceptions.hh>
00043 
00044 #include <string.h>
00045 #include <algorithm>
00046 #include <map>
00047 
00048 #include <math.h>
00049 #ifdef MSVC
00050 #include <float.h> //for _isnan
00051 #endif
00052 
00053 using namespace std;
00054 
00055 /*******************/
00056 /*** constructor ***/
00057 RocCurve::RocCurve()
00058 {
00059   initialize();
00060 }
00061 
00062 /*******************/
00063 /*** destructor ***/
00064 RocCurve::~RocCurve()
00065 {
00066 }
00067 
00068 /******************/
00069 /*** initialize ***/
00070 void RocCurve::initialize( int resolution )
00071 {
00072   _resolution = resolution;
00073   _approach = 0; // to be set later
00074   _num_background_points = ROC_DEFAULT_BACKGROUND_POINTS; // when no absences are provided
00075   _use_absences_as_background = false;
00076 }
00077 
00078 /******************/
00079 /*** initialize ***/
00080 void RocCurve::initialize( int resolution, int num_background_points )
00081 {
00082   _resolution = resolution;
00083   _approach = 2;
00084   _num_background_points = num_background_points;
00085   _use_absences_as_background = false;
00086 }
00087 
00088 /******************/
00089 /*** initialize ***/
00090 void RocCurve::initialize( int resolution, bool use_absences_as_background )
00091 {
00092   _resolution = resolution;
00093   _approach = 2;
00094   _num_background_points = 0; // to be set later (= number of absence points)
00095   _use_absences_as_background = use_absences_as_background;
00096 }
00097 
00098 /*************/
00099 /*** reset ***/
00100 void RocCurve::reset()
00101 {
00102   Log::instance()->debug( "Resetting ROC curve\n" );
00103 
00104   _ready = false;
00105   _category.erase( _category.begin(), _category.end() );
00106   _prediction.erase( _prediction.begin(), _prediction.end() );
00107   _data.erase( _data.begin(), _data.end() );
00108   _true_negatives = 0;
00109   _true_positives = 0;
00110   _auc = -1.0;
00111 
00112   _proportions.erase( _proportions.begin(), _proportions.end() );
00113   _proportions.reserve( _resolution );
00114 
00115   _thresholds.erase( _thresholds.begin(), _thresholds.end() );
00116   _thresholds.reserve( _resolution );
00117 
00118   // Compute thresholds
00119   for ( int i = 0; i < _resolution; i++ ) {
00120 
00121     _thresholds.push_back( Scalar(i) / ( _resolution - 1 ) );
00122     _proportions.push_back( 0 );
00123   }
00124 
00125   _ratios.clear();
00126 }
00127 
00128 
00129 /*****************/
00130 /*** calculate ***/
00131 void RocCurve::calculate( const Model& model, const SamplerPtr& sampler )
00132 {
00133   Log::instance()->info( "Calculating ROC curve\n" );
00134 
00135   reset();
00136 
00137   model->setNormalization( sampler );
00138 
00139   _loadPredictions( model, sampler );
00140 
00141   _calculateGraphPoints();
00142 
00143   _ready = true;
00144 }
00145 
00146 
00147 /************************/
00148 /*** load Predictions ***/
00149 void RocCurve::_loadPredictions( const Model& model, const SamplerPtr& sampler )
00150 {
00151   // Check parameters
00152 
00153   EnvironmentPtr env = sampler->getEnvironment();
00154 
00155   if ( ! env ) {
00156 
00157     std::string msg = "No environment specified for the ROC curve\n";
00158 
00159     Log::instance()->error( msg.c_str() );
00160     throw InvalidParameterException( msg );
00161   }
00162 
00163   OccurrencesPtr presences = sampler->getPresences();
00164   OccurrencesPtr absences = sampler->getAbsences();
00165 
00166   if ( ! presences ) {
00167 
00168     std::string msg = "No presence points specified for the ROC curve\n";
00169 
00170     Log::instance()->error( msg.c_str() );
00171     throw InvalidParameterException( msg );
00172   }
00173 
00174   // Set approach and check parameters
00175 
00176   int size = presences->numOccurrences();
00177 
00178   if ( absences && ! absences->isEmpty() ) {
00179 
00180     if ( _approach == 0 ) { // undefined
00181 
00182       _approach = 1; // traditional approach
00183     }
00184 
00185     int num_absences = absences->numOccurrences();
00186 
00187     if ( _approach == 1 ) { // traditional approach
00188 
00189       size += num_absences;
00190     }
00191     else {
00192 
00193       if ( _use_absences_as_background ) {
00194 
00195         if ( absences->isEmpty() ) {
00196 
00197           std::string msg = "Cannot use absences as background points for the ROC curve when no absences are provided\n";
00198 
00199           Log::instance()->error( msg.c_str() );
00200           throw InvalidParameterException( msg );
00201         }
00202 
00203         _num_background_points = num_absences;
00204       }
00205       else {
00206 
00207         Log::instance()->info( "Ignoring absences for the ROC curve\n" );
00208       }
00209     }
00210   }
00211   else {
00212 
00213     // No absences provided
00214 
00215     if ( _approach == 0 ) { // undefined
00216 
00217       _approach = 2; // proportional area
00218     }
00219     else if ( _approach == 1 ) { // traditional approach
00220 
00221       std::string msg = "Cannot calculate traditional ROC curve when no absences are provided\n";
00222 
00223       Log::instance()->error( msg.c_str() );
00224       throw InvalidParameterException( msg );
00225     }
00226 
00227     if ( _use_absences_as_background ) {
00228 
00229       std::string msg = "Cannot use absences as background points for the ROC curve when no absences are provided\n";
00230 
00231       Log::instance()->error( msg.c_str() );
00232       throw InvalidParameterException( msg );
00233     }
00234   }
00235 
00236   // Load predictions
00237 
00238   _category.reserve( size );
00239   _prediction.reserve( size );
00240 
00241   OccurrencesImpl::const_iterator it = presences->begin();
00242   OccurrencesImpl::const_iterator fin = presences->end();
00243 
00244   Scalar predictionValue;
00245 
00246   // Store predictions for presence points
00247   int i = 0;
00248   while( it != fin ) {
00249 
00250     Sample sample; 
00251 
00252     if ( (*it)->hasEnvironment() ) {
00253 
00254       sample = (*it)->environment();
00255     }
00256     else {
00257 
00258       sample = env->get( (*it)->x(), (*it)->y() );
00259     }
00260 
00261     if ( sample.size() > 0 ) {
00262 
00263       ++i;
00264 
00265       predictionValue = model->getValue( sample );
00266 
00267       _category.push_back( 1 );
00268       _prediction.push_back( predictionValue );
00269 
00270       Log::instance()->debug( "Probability for presence point %s (%f,%f): %f\n", 
00271                    ((*it)->id()).c_str(), (*it)->x(), (*it)->y(), predictionValue );
00272     }
00273     else {
00274 
00275       Log::instance()->warn( "Skipping point (%s) with no environmental data!\n", 
00276                    ((*it)->id()).c_str() );
00277     }
00278 
00279     ++it;
00280   }
00281 
00282   // Store predictions for absence points
00283   i = 0;
00284   if ( _approach == 1 ) {
00285 
00286     it = absences->begin();
00287     fin = absences->end();
00288 
00289     while( it != fin ) {
00290 
00291       Sample sample;
00292 
00293       if ( (*it)->hasEnvironment() ) {
00294 
00295         sample = (*it)->environment();
00296       }
00297       else {
00298 
00299         sample = env->get( (*it)->x(), (*it)->y() );
00300       }
00301 
00302       if ( sample.size() > 0 ) {
00303 
00304         ++i;
00305 
00306         predictionValue = model->getValue( sample );
00307 
00308         _category.push_back( 0 );
00309         _prediction.push_back( predictionValue );
00310 
00311         Log::instance()->debug( "Probability for absence point %s (%f,%f): %f\n", 
00312                      ((*it)->id()).c_str(), (*it)->x(), (*it)->y(), predictionValue );
00313       }
00314       else {
00315 
00316         Log::instance()->warn( "Skipping point (%s) with no environmental data!\n", 
00317                      ((*it)->id()).c_str() );
00318       }
00319 
00320       ++it;
00321     }
00322   }
00323   else {
00324 
00325     if ( _use_absences_as_background ) {
00326 
00327       Log::instance()->info( "Using %d absences as background for the ROC curve\n", _num_background_points );
00328 
00329       it = absences->begin();
00330       fin = absences->end();
00331 
00332       while( it != fin ) {
00333 
00334         Sample sample;
00335 
00336         if ( (*it)->hasEnvironment() ) {
00337 
00338           sample = (*it)->environment();
00339         }
00340         else {
00341 
00342           sample = env->get( (*it)->x(), (*it)->y() );
00343         }
00344 
00345         if ( sample.size() > 0 ) {
00346 
00347           ++i;
00348 
00349           predictionValue = model->getValue( sample );
00350 
00351           for ( unsigned int j = 0; j < _thresholds.size(); j++ ) {
00352 
00353             if ( predictionValue < _thresholds[j] ) {
00354 
00355               break;
00356             }
00357 
00358             _proportions[j] += 1;
00359           }
00360         }
00361 
00362         ++it;
00363       }
00364     }
00365     else {
00366 
00367       // Generate background points
00368 
00369       Log::instance()->info( "Generating %d background points\n", _num_background_points );
00370 
00371       Scalar prob;
00372 
00373       int i = 0;
00374 
00375       do {
00376 
00377         Coord x,y;
00378         Sample s( env->getRandom( &x, &y ) );
00379 
00380         prob = model->getValue( s );
00381 
00382         for ( unsigned int j = 0; j < _thresholds.size(); j++ ) {
00383 
00384           if ( prob < _thresholds[j] ) {
00385 
00386             break;
00387           }
00388 
00389           _proportions[j] += 1;
00390         }
00391 
00392         ++i;
00393 
00394       } while ( i < _num_background_points );
00395     }
00396   }
00397 
00398   if ( _approach == 2 ) { // proportional area
00399 
00400     for ( unsigned int f = 0; f < _proportions.size(); f++ ) {
00401 
00402       _proportions[f] /= _num_background_points;
00403     }
00404   }
00405 }
00406 
00407 
00408 /******************************/
00409 /*** calculate Graph Points ***/
00410 void RocCurve::_calculateGraphPoints()
00411 {
00412   int i, j, num_pairs = _prediction.size();
00413 
00414   if ( _approach == 2 ) {
00415 
00416     Log::instance()->debug( "Using proportional area approach\n" );
00417   }
00418   else {
00419 
00420     Log::instance()->debug( "Using traditional ROC approach (presence x absence)\n" );
00421   }
00422 
00423   // Compute a specified number of data points for the graph
00424   for ( i = 0; i < _resolution; i++ ) {
00425 
00426     // Define counters
00427     int num_tp = 0; // true positives
00428     int num_fp = 0; // false positives
00429     int num_tn = 0; // true negatives
00430     int num_fn = 0; // false negatives
00431 
00432     // Positivity criterion for current point
00433     Scalar threshold = _thresholds[i];
00434 
00435     Log::instance()->debug( "Calculating ROC point for %f threshold\n", threshold );
00436 
00437     // Process all pairs
00438     for ( j = 0; j < num_pairs; j++ ) {
00439 
00440       // Determine actual (binarized) and predicted value
00441       bool act_flag = ( _category[j] == 1 );
00442       bool prd_flag = ( _prediction[j] >= threshold );
00443 
00444       // Increment counters
00445       if ( act_flag && prd_flag ) {
00446 
00447         num_tp++;
00448       }
00449       else if ( !act_flag && !prd_flag ) {
00450 
00451         num_tn++;
00452       }
00453       else if ( !act_flag && prd_flag ) {
00454 
00455         num_fp++;
00456       }
00457       else {
00458 
00459         num_fn++;
00460       }
00461     }
00462 
00463     // Define counter sums
00464     int num_ap = num_tp + num_fn; // actual positives
00465     int num_pp = num_tp + num_fp; // predicted positives
00466     int num_an = num_tn + num_fp; // actual negatives
00467     int num_pn = num_tn + num_fn; // predicted negatives
00468     int num_tt = num_ap + num_an; // Total tally
00469 
00470     _true_negatives = num_an;
00471     _true_positives = num_ap;
00472 
00473     // Determine sensitivity, specificity, etc.
00474     Scalar sensitivity = (num_ap == 0) ? Scalar(-1) : Scalar(num_tp) / num_ap;
00475     Scalar specificity = (num_an == 0) ? Scalar(-1) : Scalar(num_tn) / num_an;
00476     Scalar ppvalue     = (num_pp == 0) ? Scalar(-1) : Scalar(num_tp) / num_pp;
00477     Scalar npvalue     = (num_pn == 0) ? Scalar(-1) : Scalar(num_tn) / num_pn;
00478     Scalar accuracy    = (num_tt == 0) ? Scalar(-1) : Scalar(num_tp + num_tn) / num_tt;
00479 
00480     // Ignore points with unknown sensitivity
00481     if ( sensitivity == -1 ) {
00482 
00483       Log::instance()->debug( "Ignoring point with unknown sensitivity\n" );
00484       continue;
00485     }
00486 
00487     // Ignore points with unknown specificity if absence points were provided
00488     if ( specificity == -1 && _approach == 1 ) {
00489 
00490       Log::instance()->debug( "Ignoring point with unknown specificity\n" );
00491       continue;
00492     }
00493 
00494     std::vector<Scalar> v;
00495     v.reserve(6);
00496 
00497     // Store vector contents
00498 
00499     // x value
00500     if ( _approach == 1 ) {
00501 
00502       v.push_back(1 - specificity);
00503       Log::instance()->debug( "1 - specificity = %f\n", 1 - specificity );
00504     }
00505     else {
00506 
00507       v.push_back( _proportions[i] );
00508       Log::instance()->debug( "Proportion = %f\n", _proportions[i] );
00509     }
00510 
00511     Log::instance()->debug( "Sensitivity = %f\n", sensitivity );
00512 
00513     // y value
00514     v.push_back(sensitivity);
00515 
00516     v.push_back(ppvalue);
00517     v.push_back(npvalue);
00518     v.push_back(accuracy);
00519     v.push_back(threshold);
00520 
00521     // Append to data vector
00522     _data.push_back(v);
00523   }
00524 
00525   // Append (0, 0) artificially.
00526   std::vector<Scalar> v00;
00527 
00528   v00.reserve(6);
00529 
00530   v00.push_back(0.0);  // 1 - specificity
00531   v00.push_back(0.0);  // sensitivity
00532   v00.push_back(-1);   // ppvalue
00533   v00.push_back(-1);   // npvalue
00534   v00.push_back(-1);   // accuracy
00535   v00.push_back(-1);   // threshold
00536 
00537   _data.push_back(v00);
00538 
00539   if ( _approach == 1 ) {
00540 
00541     // Append (1, 1) artificially.
00542     std::vector<Scalar> v11;
00543 
00544     v11.reserve(6);
00545 
00546     v11.push_back(1.0);  // 1 - specificity
00547     v11.push_back(1.0);  // sensitivity
00548     v11.push_back(-1);   // ppvalue
00549     v11.push_back(-1);   // npvalue
00550     v11.push_back(-1);   // accuracy
00551     v11.push_back(-1);   // threshold
00552 
00553     _data.push_back(v11);
00554   }
00555 
00556   VectorCompare compare;
00557 
00558   // Sort ROC points.
00559   std::sort( _data.begin(), _data.end(), compare );
00560 }
00561 
00562 
00563 /**********************/
00564 /*** calculate Area ***/
00565 bool RocCurve::_calculateTotalArea()
00566 {
00567   _auc = -1.0;
00568 
00569   int i, num_points = numPoints();
00570 
00571   // Verify dimensions
00572   if ( num_points < 2 ) {
00573 
00574     return false;
00575   }
00576 
00577   double area = 0.0;
00578 
00579   // Approximate area under ROC curve with trapezes
00580   for ( i = 1; i < num_points; i++ ) {
00581 
00582     double x1 = getX(i - 1);
00583     double y1 = getY(i - 1);
00584     double x2 = getX(i);
00585     double y2 = getY(i);
00586 
00587     if ( x2 != x1 ) {
00588 
00589       area += (x2 - x1) * 0.5 * (y1 + y2);
00590     }
00591   }
00592 
00593   _auc = area;
00594 
00595   return true;
00596 }
00597 
00598 
00599 /**********************/
00600 /*** get Total Area ***/
00601 double 
00602 RocCurve::getTotalArea() {
00603 
00604   if ( _auc < 0.0 ) {
00605 
00606     _calculateTotalArea();
00607   }
00608 
00609   return _auc; 
00610 }
00611 
00612 
00613 /******************************/
00614 /*** get Partial Area Ratio ***/
00615 double RocCurve::getPartialAreaRatio( double e )
00616 {
00617   Log::instance()->info( "Calculating partial area for limit %f\n", e );
00618 
00619   if ( e < 0.0 ) {
00620 
00621     e = 0.0;
00622   }
00623 
00624   if ( e > 1.0 ) {
00625 
00626     e = 1.0;
00627   }
00628 
00629   double area = 0.0;
00630 
00631   double diag_area = 0.0;
00632 
00633   int i, num_points = numPoints();
00634 
00635   // Verify dimensions
00636   if ( num_points < 2 ) {
00637 
00638     return -1.0;
00639   }
00640 
00641   map<double, double>::iterator it;
00642 
00643   it = _ratios.find( e );
00644 
00645   if ( it != _ratios.end() ) {
00646 
00647     return _ratios[e];
00648   }
00649 
00650   bool interpolate = true;
00651 
00652   // Approximate area under ROC curve with trapezes
00653   for ( i = 1; i < num_points; i++ ) {
00654 
00655     double x1 = getX(i - 1);
00656     double y1 = getY(i - 1);
00657     double x2 = getX(i);
00658     double y2 = getY(i);
00659 
00660     // Only points where Y is greater than or equals 1-e (e=maximum accepted omission)
00661     if ( x2 != x1 ) {
00662 
00663       if ( y1 == (1.0 - e) ) {
00664 
00665         area += (x2 - x1) * 0.5 * (y1 + y2);
00666 
00667         diag_area += (x2 - x1) * 0.5 * (x1 + x2);
00668 
00669         interpolate = false;
00670       }
00671       else if ( y1 > (1.0 - e) ) { // y1 is in ascending order
00672 
00673         if ( interpolate ) {
00674  
00675           if ( i > 1 ) {
00676 
00677             double x0 = getX(i - 2);
00678             double y0 = getY(i - 2);
00679 
00680             double y = 1.0 - e;
00681 
00682             double x;
00683 
00684             if ( y1 == y0 ) {
00685 
00686               x = x1 - x0;
00687             }
00688             else {
00689 
00690               x = x1 - ((x1-x0)*(y1-y)/(y1-y0));
00691             }
00692 
00693             // Add missing previous area via interpolation
00694             area += (x1 - x) * 0.5 * (y + y1);
00695 
00696             diag_area += (x1 - x) * 0.5 * (x + x1);
00697 
00698             // Normal trapezoid
00699             area += (x2 - x1) * 0.5 * (y1 + y2);
00700 
00701             diag_area += (x2 - x1) * 0.5 * (x1 + x2);
00702 
00703             interpolate = false;
00704           }
00705         }
00706         else {
00707  
00708           area += (x2 - x1) * 0.5 * (y1 + y2);
00709 
00710           diag_area += (x2 - x1) * 0.5 * (x1 + x2);
00711         }
00712       }
00713     }
00714   }
00715 
00716   Log::instance()->debug( "Partial area calculated as: %f / %f\n", area, diag_area );
00717 
00718   double ratio = area / diag_area;
00719 
00720 #ifdef MSVC
00721   bool ratio_isnan = (_isnan(ratio) == 1) ? true : false;
00722 #else
00723   bool ratio_isnan = isnan(ratio);
00724 #endif
00725   if ( ratio_isnan ) {
00726 
00727     ratio = 0.0;
00728   }
00729 
00730   _ratios[e] = ratio;
00731 
00732   return _ratios[e];
00733 }
00734 
00735 
00736 /*************************/
00737 /*** get Configuration ***/
00738 ConfigurationPtr 
00739 RocCurve::getConfiguration() const
00740 {
00741   ConfigurationPtr config( new ConfigurationImpl("RocCurve") );
00742 
00743   config->addNameValue( "Auc", _auc );
00744 
00745   if ( _approach == 2 ) {
00746 
00747     config->addNameValue( "NumBackgroundPoints", _num_background_points );
00748   }
00749 
00750   int num_points = numPoints();
00751 
00752   double *tmp_points = new double[num_points*2];
00753 
00754   int cnt = 0;
00755 
00756   for ( int i = 0; i < num_points; ++i, ++cnt ) {
00757 
00758     tmp_points[cnt] = getX( i );
00759 
00760     ++cnt;
00761 
00762     tmp_points[cnt] = getY( i );
00763   }
00764 
00765   config->addNameValue( "Points", tmp_points, num_points*2 );
00766 
00767   for ( map<double, double>::const_iterator it = _ratios.begin(); it != _ratios.end(); it++ ) {
00768 
00769     ConfigurationPtr ratio( new ConfigurationImpl( "Ratio" ) );
00770 
00771     ratio->addNameValue( "E", (*it).first );
00772     ratio->addNameValue( "Value", (*it).second );
00773 
00774     config->addSubsection( ratio );
00775   }
00776 
00777   delete[] tmp_points;
00778 
00779   return config;
00780 }