openModeller  Version 1.5.0
RocCurve.cpp
Go to the documentation of this file.
1 
32 #include <stdio.h>
33 
34 #include <openmodeller/RocCurve.hh>
35 #include <openmodeller/Sampler.hh>
40 #include <openmodeller/Log.hh>
41 
43 
44 #include <string.h>
45 #include <algorithm>
46 #include <map>
47 
48 #include <math.h>
49 #ifdef MSVC
50 #include <float.h> //for _isnan
51 #endif
52 
53 using namespace std;
54 
55 /*******************/
56 /*** constructor ***/
58 {
59  initialize();
60 }
61 
62 /*******************/
63 /*** destructor ***/
65 {
66 }
67 
68 /******************/
69 /*** initialize ***/
70 void RocCurve::initialize( int resolution )
71 {
72  _resolution = resolution;
73  _approach = 0; // to be set later
74  _num_background_points = ROC_DEFAULT_BACKGROUND_POINTS; // when no absences are provided
75  _use_absences_as_background = false;
76 }
77 
78 /******************/
79 /*** initialize ***/
80 void RocCurve::initialize( int resolution, int num_background_points )
81 {
82  _resolution = resolution;
83  _approach = 2;
84  _num_background_points = num_background_points;
85  _use_absences_as_background = false;
86 }
87 
88 /******************/
89 /*** initialize ***/
90 void RocCurve::initialize( int resolution, bool use_absences_as_background )
91 {
92  _resolution = resolution;
93  _approach = 2;
94  _num_background_points = 0; // to be set later (= number of absence points)
95  _use_absences_as_background = use_absences_as_background;
96 }
97 
98 /*************/
99 /*** reset ***/
101 {
102  Log::instance()->debug( "Resetting ROC curve\n" );
103 
104  _ready = false;
105  _category.erase( _category.begin(), _category.end() );
106  _prediction.erase( _prediction.begin(), _prediction.end() );
107  _data.erase( _data.begin(), _data.end() );
108  _true_negatives = 0;
109  _true_positives = 0;
110  _auc = -1.0;
111 
112  _proportions.erase( _proportions.begin(), _proportions.end() );
113  _proportions.reserve( _resolution );
114 
115  _thresholds.erase( _thresholds.begin(), _thresholds.end() );
116  _thresholds.reserve( _resolution );
117 
118  // Compute thresholds
119  for ( int i = 0; i < _resolution; i++ ) {
120 
121  _thresholds.push_back( Scalar(i) / ( _resolution - 1 ) );
122  _proportions.push_back( 0 );
123  }
124 
125  _ratios.clear();
126 }
127 
128 
129 /*****************/
130 /*** calculate ***/
131 void RocCurve::calculate( const Model& model, const SamplerPtr& sampler )
132 {
133  Log::instance()->info( "Calculating ROC curve\n" );
134 
135  reset();
136 
137  model->setNormalization( sampler );
138 
139  _loadPredictions( model, sampler );
140 
141  _calculateGraphPoints();
142 
143  _ready = true;
144 }
145 
146 
147 /************************/
148 /*** load Predictions ***/
149 void RocCurve::_loadPredictions( const Model& model, const SamplerPtr& sampler )
150 {
151  // Check parameters
152 
153  EnvironmentPtr env = sampler->getEnvironment();
154 
155  if ( ! env ) {
156 
157  std::string msg = "No environment specified for the ROC curve\n";
158 
159  Log::instance()->error( msg.c_str() );
160  throw InvalidParameterException( msg );
161  }
162 
163  OccurrencesPtr presences = sampler->getPresences();
164  OccurrencesPtr absences = sampler->getAbsences();
165 
166  if ( ! presences ) {
167 
168  std::string msg = "No presence points specified for the ROC curve\n";
169 
170  Log::instance()->error( msg.c_str() );
171  throw InvalidParameterException( msg );
172  }
173 
174  // Set approach and check parameters
175 
176  int size = presences->numOccurrences();
177 
178  if ( absences && ! absences->isEmpty() ) {
179 
180  if ( _approach == 0 ) { // undefined
181 
182  _approach = 1; // traditional approach
183  }
184 
185  int num_absences = absences->numOccurrences();
186 
187  if ( _approach == 1 ) { // traditional approach
188 
189  size += num_absences;
190  }
191  else {
192 
193  if ( _use_absences_as_background ) {
194 
195  if ( absences->isEmpty() ) {
196 
197  std::string msg = "Cannot use absences as background points for the ROC curve when no absences are provided\n";
198 
199  Log::instance()->error( msg.c_str() );
200  throw InvalidParameterException( msg );
201  }
202 
203  _num_background_points = num_absences;
204  }
205  else {
206 
207  Log::instance()->info( "Ignoring absences for the ROC curve\n" );
208  }
209  }
210  }
211  else {
212 
213  // No absences provided
214 
215  if ( _approach == 0 ) { // undefined
216 
217  _approach = 2; // proportional area
218  }
219  else if ( _approach == 1 ) { // traditional approach
220 
221  std::string msg = "Cannot calculate traditional ROC curve when no absences are provided\n";
222 
223  Log::instance()->error( msg.c_str() );
224  throw InvalidParameterException( msg );
225  }
226 
227  if ( _use_absences_as_background ) {
228 
229  std::string msg = "Cannot use absences as background points for the ROC curve when no absences are provided\n";
230 
231  Log::instance()->error( msg.c_str() );
232  throw InvalidParameterException( msg );
233  }
234  }
235 
236  // Load predictions
237 
238  _category.reserve( size );
239  _prediction.reserve( size );
240 
241  OccurrencesImpl::const_iterator it = presences->begin();
242  OccurrencesImpl::const_iterator fin = presences->end();
243 
244  Scalar predictionValue;
245 
246  // Store predictions for presence points
247  int i = 0;
248  while( it != fin ) {
249 
250  Sample sample;
251 
252  if ( (*it)->hasEnvironment() ) {
253 
254  sample = (*it)->environment();
255  }
256  else {
257 
258  sample = env->get( (*it)->x(), (*it)->y() );
259  }
260 
261  if ( sample.size() > 0 ) {
262 
263  ++i;
264 
265  predictionValue = model->getValue( sample );
266 
267  _category.push_back( 1 );
268  _prediction.push_back( predictionValue );
269 
270  Log::instance()->debug( "Probability for presence point %s (%f,%f): %f\n",
271  ((*it)->id()).c_str(), (*it)->x(), (*it)->y(), predictionValue );
272  }
273  else {
274 
275  Log::instance()->warn( "Skipping point (%s) with no environmental data!\n",
276  ((*it)->id()).c_str() );
277  }
278 
279  ++it;
280  }
281 
282  // Store predictions for absence points
283  i = 0;
284  if ( _approach == 1 ) {
285 
286  it = absences->begin();
287  fin = absences->end();
288 
289  while( it != fin ) {
290 
291  Sample sample;
292 
293  if ( (*it)->hasEnvironment() ) {
294 
295  sample = (*it)->environment();
296  }
297  else {
298 
299  sample = env->get( (*it)->x(), (*it)->y() );
300  }
301 
302  if ( sample.size() > 0 ) {
303 
304  ++i;
305 
306  predictionValue = model->getValue( sample );
307 
308  _category.push_back( 0 );
309  _prediction.push_back( predictionValue );
310 
311  Log::instance()->debug( "Probability for absence point %s (%f,%f): %f\n",
312  ((*it)->id()).c_str(), (*it)->x(), (*it)->y(), predictionValue );
313  }
314  else {
315 
316  Log::instance()->warn( "Skipping point (%s) with no environmental data!\n",
317  ((*it)->id()).c_str() );
318  }
319 
320  ++it;
321  }
322  }
323  else {
324 
325  if ( _use_absences_as_background ) {
326 
327  Log::instance()->info( "Using %d absences as background for the ROC curve\n", _num_background_points );
328 
329  it = absences->begin();
330  fin = absences->end();
331 
332  while( it != fin ) {
333 
334  Sample sample;
335 
336  if ( (*it)->hasEnvironment() ) {
337 
338  sample = (*it)->environment();
339  }
340  else {
341 
342  sample = env->get( (*it)->x(), (*it)->y() );
343  }
344 
345  if ( sample.size() > 0 ) {
346 
347  ++i;
348 
349  predictionValue = model->getValue( sample );
350 
351  for ( unsigned int j = 0; j < _thresholds.size(); j++ ) {
352 
353  if ( predictionValue < _thresholds[j] ) {
354 
355  break;
356  }
357 
358  _proportions[j] += 1;
359  }
360  }
361 
362  ++it;
363  }
364  }
365  else {
366 
367  // Generate background points
368 
369  Log::instance()->info( "Generating %d background points\n", _num_background_points );
370 
371  Scalar prob;
372 
373  int i = 0;
374 
375  do {
376 
377  Coord x,y;
378  Sample s( env->getRandom( &x, &y ) );
379 
380  prob = model->getValue( s );
381 
382  for ( unsigned int j = 0; j < _thresholds.size(); j++ ) {
383 
384  if ( prob < _thresholds[j] ) {
385 
386  break;
387  }
388 
389  _proportions[j] += 1;
390  }
391 
392  ++i;
393 
394  } while ( i < _num_background_points );
395  }
396  }
397 
398  if ( _approach == 2 ) { // proportional area
399 
400  for ( unsigned int f = 0; f < _proportions.size(); f++ ) {
401 
402  _proportions[f] /= _num_background_points;
403  }
404  }
405 }
406 
407 
408 /******************************/
409 /*** calculate Graph Points ***/
411 {
412  int i, j, num_pairs = _prediction.size();
413 
414  if ( _approach == 2 ) {
415 
416  Log::instance()->debug( "Using proportional area approach\n" );
417  }
418  else {
419 
420  Log::instance()->debug( "Using traditional ROC approach (presence x absence)\n" );
421  }
422 
423  // Compute a specified number of data points for the graph
424  for ( i = 0; i < _resolution; i++ ) {
425 
426  // Define counters
427  int num_tp = 0; // true positives
428  int num_fp = 0; // false positives
429  int num_tn = 0; // true negatives
430  int num_fn = 0; // false negatives
431 
432  // Positivity criterion for current point
433  Scalar threshold = _thresholds[i];
434 
435  Log::instance()->debug( "Calculating ROC point for %f threshold\n", threshold );
436 
437  // Process all pairs
438  for ( j = 0; j < num_pairs; j++ ) {
439 
440  // Determine actual (binarized) and predicted value
441  bool act_flag = ( _category[j] == 1 );
442  bool prd_flag = ( _prediction[j] >= threshold );
443 
444  // Increment counters
445  if ( act_flag && prd_flag ) {
446 
447  num_tp++;
448  }
449  else if ( !act_flag && !prd_flag ) {
450 
451  num_tn++;
452  }
453  else if ( !act_flag && prd_flag ) {
454 
455  num_fp++;
456  }
457  else {
458 
459  num_fn++;
460  }
461  }
462 
463  // Define counter sums
464  int num_ap = num_tp + num_fn; // actual positives
465  int num_pp = num_tp + num_fp; // predicted positives
466  int num_an = num_tn + num_fp; // actual negatives
467  int num_pn = num_tn + num_fn; // predicted negatives
468  int num_tt = num_ap + num_an; // Total tally
469 
470  _true_negatives = num_an;
471  _true_positives = num_ap;
472 
473  // Determine sensitivity, specificity, etc.
474  Scalar sensitivity = (num_ap == 0) ? Scalar(-1) : Scalar(num_tp) / num_ap;
475  Scalar specificity = (num_an == 0) ? Scalar(-1) : Scalar(num_tn) / num_an;
476  Scalar ppvalue = (num_pp == 0) ? Scalar(-1) : Scalar(num_tp) / num_pp;
477  Scalar npvalue = (num_pn == 0) ? Scalar(-1) : Scalar(num_tn) / num_pn;
478  Scalar accuracy = (num_tt == 0) ? Scalar(-1) : Scalar(num_tp + num_tn) / num_tt;
479 
480  // Ignore points with unknown sensitivity
481  if ( sensitivity == -1 ) {
482 
483  Log::instance()->debug( "Ignoring point with unknown sensitivity\n" );
484  continue;
485  }
486 
487  // Ignore points with unknown specificity if absence points were provided
488  if ( specificity == -1 && _approach == 1 ) {
489 
490  Log::instance()->debug( "Ignoring point with unknown specificity\n" );
491  continue;
492  }
493 
494  std::vector<Scalar> v;
495  v.reserve(6);
496 
497  // Store vector contents
498 
499  // x value
500  if ( _approach == 1 ) {
501 
502  v.push_back(1 - specificity);
503  Log::instance()->debug( "1 - specificity = %f\n", 1 - specificity );
504  }
505  else {
506 
507  v.push_back( _proportions[i] );
508  Log::instance()->debug( "Proportion = %f\n", _proportions[i] );
509  }
510 
511  Log::instance()->debug( "Sensitivity = %f\n", sensitivity );
512 
513  // y value
514  v.push_back(sensitivity);
515 
516  v.push_back(ppvalue);
517  v.push_back(npvalue);
518  v.push_back(accuracy);
519  v.push_back(threshold);
520 
521  // Append to data vector
522  _data.push_back(v);
523  }
524 
525  // Append (0, 0) artificially.
526  std::vector<Scalar> v00;
527 
528  v00.reserve(6);
529 
530  v00.push_back(0.0); // 1 - specificity
531  v00.push_back(0.0); // sensitivity
532  v00.push_back(-1); // ppvalue
533  v00.push_back(-1); // npvalue
534  v00.push_back(-1); // accuracy
535  v00.push_back(-1); // threshold
536 
537  _data.push_back(v00);
538 
539  if ( _approach == 1 ) {
540 
541  // Append (1, 1) artificially.
542  std::vector<Scalar> v11;
543 
544  v11.reserve(6);
545 
546  v11.push_back(1.0); // 1 - specificity
547  v11.push_back(1.0); // sensitivity
548  v11.push_back(-1); // ppvalue
549  v11.push_back(-1); // npvalue
550  v11.push_back(-1); // accuracy
551  v11.push_back(-1); // threshold
552 
553  _data.push_back(v11);
554  }
555 
556  VectorCompare compare;
557 
558  // Sort ROC points.
559  std::sort( _data.begin(), _data.end(), compare );
560 }
561 
562 
563 /**********************/
564 /*** calculate Area ***/
566 {
567  _auc = -1.0;
568 
569  int i, num_points = numPoints();
570 
571  // Verify dimensions
572  if ( num_points < 2 ) {
573 
574  return false;
575  }
576 
577  double area = 0.0;
578 
579  // Approximate area under ROC curve with trapezes
580  for ( i = 1; i < num_points; i++ ) {
581 
582  double x1 = getX(i - 1);
583  double y1 = getY(i - 1);
584  double x2 = getX(i);
585  double y2 = getY(i);
586 
587  if ( x2 != x1 ) {
588 
589  area += (x2 - x1) * 0.5 * (y1 + y2);
590  }
591  }
592 
593  _auc = area;
594 
595  return true;
596 }
597 
598 
599 /**********************/
600 /*** get Total Area ***/
601 double
603 
604  if ( _auc < 0.0 ) {
605 
606  _calculateTotalArea();
607  }
608 
609  return _auc;
610 }
611 
612 
613 /******************************/
614 /*** get Partial Area Ratio ***/
616 {
617  Log::instance()->info( "Calculating partial area for limit %f\n", e );
618 
619  if ( e < 0.0 ) {
620 
621  e = 0.0;
622  }
623 
624  if ( e > 1.0 ) {
625 
626  e = 1.0;
627  }
628 
629  double area = 0.0;
630 
631  double diag_area = 0.0;
632 
633  int i, num_points = numPoints();
634 
635  // Verify dimensions
636  if ( num_points < 2 ) {
637 
638  return -1.0;
639  }
640 
641  map<double, double>::iterator it;
642 
643  it = _ratios.find( e );
644 
645  if ( it != _ratios.end() ) {
646 
647  return _ratios[e];
648  }
649 
650  bool interpolate = true;
651 
652  // Approximate area under ROC curve with trapezes
653  for ( i = 1; i < num_points; i++ ) {
654 
655  double x1 = getX(i - 1);
656  double y1 = getY(i - 1);
657  double x2 = getX(i);
658  double y2 = getY(i);
659 
660  // Only points where Y is greater than or equals 1-e (e=maximum accepted omission)
661  if ( x2 != x1 ) {
662 
663  if ( y1 == (1.0 - e) ) {
664 
665  area += (x2 - x1) * 0.5 * (y1 + y2);
666 
667  diag_area += (x2 - x1) * 0.5 * (x1 + x2);
668 
669  interpolate = false;
670  }
671  else if ( y1 > (1.0 - e) ) { // y1 is in ascending order
672 
673  if ( interpolate ) {
674 
675  if ( i > 1 ) {
676 
677  double x0 = getX(i - 2);
678  double y0 = getY(i - 2);
679 
680  double y = 1.0 - e;
681 
682  double x;
683 
684  if ( y1 == y0 ) {
685 
686  x = x1 - x0;
687  }
688  else {
689 
690  x = x1 - ((x1-x0)*(y1-y)/(y1-y0));
691  }
692 
693  // Add missing previous area via interpolation
694  area += (x1 - x) * 0.5 * (y + y1);
695 
696  diag_area += (x1 - x) * 0.5 * (x + x1);
697 
698  // Normal trapezoid
699  area += (x2 - x1) * 0.5 * (y1 + y2);
700 
701  diag_area += (x2 - x1) * 0.5 * (x1 + x2);
702 
703  interpolate = false;
704  }
705  }
706  else {
707 
708  area += (x2 - x1) * 0.5 * (y1 + y2);
709 
710  diag_area += (x2 - x1) * 0.5 * (x1 + x2);
711  }
712  }
713  }
714  }
715 
716  Log::instance()->debug( "Partial area calculated as: %f / %f\n", area, diag_area );
717 
718  double ratio = area / diag_area;
719 
720 #ifdef MSVC
721  bool ratio_isnan = (_isnan(ratio) == 1) ? true : false;
722 #else
723  bool ratio_isnan = isnan(ratio);
724 #endif
725  if ( ratio_isnan ) {
726 
727  ratio = 0.0;
728  }
729 
730  _ratios[e] = ratio;
731 
732  return _ratios[e];
733 }
734 
735 
736 /*************************/
737 /*** get Configuration ***/
740 {
741  ConfigurationPtr config( new ConfigurationImpl("RocCurve") );
742 
743  config->addNameValue( "Auc", _auc );
744 
745  if ( _approach == 2 ) {
746 
747  config->addNameValue( "NumBackgroundPoints", _num_background_points );
748  }
749 
750  int num_points = numPoints();
751 
752  double *tmp_points = new double[num_points*2];
753 
754  int cnt = 0;
755 
756  for ( int i = 0; i < num_points; ++i, ++cnt ) {
757 
758  tmp_points[cnt] = getX( i );
759 
760  ++cnt;
761 
762  tmp_points[cnt] = getY( i );
763  }
764 
765  config->addNameValue( "Points", tmp_points, num_points*2 );
766 
767  for ( map<double, double>::const_iterator it = _ratios.begin(); it != _ratios.end(); it++ ) {
768 
769  ConfigurationPtr ratio( new ConfigurationImpl( "Ratio" ) );
770 
771  ratio->addNameValue( "E", (*it).first );
772  ratio->addNameValue( "Value", (*it).second );
773 
774  config->addSubsection( ratio );
775  }
776 
777  delete[] tmp_points;
778 
779  return config;
780 }
void initialize(int resolution=ROC_DEFAULT_RESOLUTION)
Definition: RocCurve.cpp:70
void warn(const char *format,...)
'Warn' level.
Definition: Log.cpp:273
~RocCurve()
Definition: RocCurve.cpp:64
double Scalar
Type of map values.
Definition: om_defs.hh:39
static Log * instance()
Returns the instance pointer, creating the object on the first call.
Definition: Log.cpp:45
void error(const char *format,...)
'Error' level.
Definition: Log.cpp:290
void _loadPredictions(const Model &model, const SamplerPtr &sampler)
Definition: RocCurve.cpp:149
#define ROC_DEFAULT_BACKGROUND_POINTS
Definition: RocCurve.hh:41
void reset()
Definition: RocCurve.cpp:100
std::size_t size() const
Definition: Sample.hh:70
void info(const char *format,...)
'Info' level.
Definition: Log.cpp:256
void calculate(const Model &model, const SamplerPtr &sampler)
Definition: RocCurve.cpp:131
ConfigurationPtr getConfiguration() const
Definition: RocCurve.cpp:739
double Coord
Type of map coordinates.
Definition: om_defs.hh:38
std::vector< OccurrencePtr >::const_iterator const_iterator
Definition: Occurrences.hh:85
RocCurve()
Definition: RocCurve.cpp:57
double getTotalArea()
Definition: RocCurve.cpp:602
bool _calculateTotalArea()
Definition: RocCurve.cpp:565
void debug(const char *format,...)
'Debug' level.
Definition: Log.cpp:237
double getPartialAreaRatio(double e=1.0)
Definition: RocCurve.cpp:615
void _calculateGraphPoints()
Definition: RocCurve.cpp:410
Definition: Sample.hh:25