openModeller Version 1.4.0
PreJackknife.cpp
#include <openmodeller/pre/PreJackknife.hh>
#include <openmodeller/Sampler.hh>
#include <openmodeller/Algorithm.hh>
#include <openmodeller/Occurrences.hh>
#include <openmodeller/Environment.hh>
#include <openmodeller/ConfusionMatrix.hh>
#include <openmodeller/Log.hh>

#include <openmodeller/Exceptions.hh>

#include <string.h>
#include <math.h>

using namespace std;

/*******************/
/*** constructor ***/
PreJackknife::PreJackknife()
{
}


/*******************/
/*** destructor ***/
PreJackknife::~PreJackknife()
{
}

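// Make sure the mandatory Sampler and Algorithm parameters were provided.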
bool
PreJackknife::checkParameters( const PreParameters& parameters ) const
{
  SamplerPtr samplerPtr;

  if ( ! parameters.retrieve( "Sampler", samplerPtr ) ) {

    Log::instance()->error( "Missing parameter: Sampler. \n" );
    return false;
  }

  AlgorithmPtr algorithmPtr;

  if ( ! parameters.retrieve( "Algorithm", algorithmPtr ) ) {

    Log::instance()->error( "Missing parameter: Algorithm. \n" );
    return false;
  }

  return true;
}

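// Parameters accepted by this pre-analysis: the Sampler, the Algorithm and
// an optional proportion of points used for training (PropTrain).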
void
PreJackknife::getAcceptedParameters( stringMap& info)
{
  info["Sampler"] = "samplerPtr";
  info["Algorithm"] = "algorithmPtr";
  info["PropTrain"] = "double";
}

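// Results computed for the whole layer set.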
void
PreJackknife::getLayersetResultSpec ( stringMap& info)
{
  info["Accuracy"] = "double";
  info["Mean"] = "double";
  info["Variance"] = "double";
  info["Deviation"] = "double";
  info["Estimate"] = "double";
  info["Bias"] = "double";
}

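// Result computed for each individual layer.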
void
PreJackknife::getLayerResultSpec ( stringMap& info)
{
  info["Accuracy without layer"] = "double";
}

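// Leave-one-layer-out jackknife: a reference model is built with all layers,
// then the model is rebuilt once per layer with that layer removed, so the
// accuracy obtained without each layer can be compared with the reference.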
bool PreJackknife::runImplementation()
{
  Log::instance()->debug( "Running jackknife\n" );

  SamplerPtr samplerPtr;
  params_.retrieve( "Sampler", samplerPtr );

  AlgorithmPtr algorithmPtr;
  params_.retrieve( "Algorithm", algorithmPtr );

  double propTrain;

  if ( ! params_.retrieve( "PropTrain", propTrain ) ) {

    // default
    propTrain = 0.9;
  }

  if ( ! samplerPtr->getEnvironment() ) {

    std::string msg = "Sampler has no environment.\n";

    Log::instance()->error( msg.c_str() );

    throw InvalidParameterException( msg );
  }

  int num_layers = samplerPtr->numIndependent();

  if ( num_layers < 2 ) {

    std::string msg = "Jackknife needs at least 2 layers.\n";

    Log::instance()->error( msg.c_str() );

    throw InvalidParameterException( msg );
  }

  // Split sampler into testing and training samplers
  SamplerPtr training_sampler;
  SamplerPtr testing_sampler;

  splitSampler( samplerPtr, &training_sampler, &testing_sampler, propTrain );

  // Calculate reference parameter using all layers
  AlgorithmPtr algorithm_ptr = algorithmPtr->getFreshCopy();

  algorithm_ptr->createModel( training_sampler );

  ConfusionMatrix conf_matrix;

  conf_matrix.calculate( algorithm_ptr->getModel(), testing_sampler );

  double out_param = conf_matrix.getAccuracy() * 100;

  // Calculate reference parameter for each layer by excluding it from the layer set

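  // Accuracy obtained without each layer, keyed by accuracy so that the
  // final report below is iterated in ascending order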
  std::multimap<double, int> out_params;

  double mean = 0.0;
  double variance = 0.0;
  double std_deviation = 0.0;
  double jackknife_estimate = 0.0;
  double jackknife_bias = 0.0;

  // Work with clones of the occurrences
  OccurrencesPtr training_presences;
  OccurrencesPtr training_absences;
  OccurrencesPtr testing_presences;
  OccurrencesPtr testing_absences;

  if ( training_sampler->numPresence() ) {

    training_presences = training_sampler->getPresences()->clone();
  }

  if ( training_sampler->numAbsence() ) {

    training_absences = training_sampler->getAbsences()->clone();
  }

  if ( testing_sampler->numPresence() ) {

    testing_presences = testing_sampler->getPresences()->clone();
  }

  if ( testing_sampler->numAbsence() ) {

    testing_absences = testing_sampler->getAbsences()->clone();
  }

  for ( int i = 0; i < num_layers; ++i ) {

    Log::instance()->debug( "Removing layer with index %u\n", i );

    // Copy the original environment
    EnvironmentPtr new_environment = samplerPtr->getEnvironment()->clone();

    PreParameters result;

    // Remove one of the layers
    new_environment->removeLayer( i );

    // Read environment data from the new set of layers
    if ( training_presences ) {

      training_presences->setEnvironment( new_environment );
    }

    if ( training_absences ) {

      training_absences->setEnvironment( new_environment );
    }

    if ( testing_presences ) {

      testing_presences->setEnvironment( new_environment );
    }

    if ( testing_absences ) {

      testing_absences->setEnvironment( new_environment );
    }

    // Create a new sampler for training points
    SamplerPtr new_training_sampler = createSampler( new_environment, training_presences, training_absences );

    // Create a new algorithm
    AlgorithmPtr new_algorithm = algorithmPtr->getFreshCopy();

    new_algorithm->createModel( new_training_sampler );

    conf_matrix.reset();

    // Create a new sampler for testing points
    SamplerPtr new_testing_sampler = createSampler( new_environment, testing_presences, testing_absences );

    // Normalize test samples if necessary
    if ( new_algorithm->needNormalization() && ! new_testing_sampler->isNormalized() ) {

      Log::instance()->info( "Computing normalization for test points\n");

      Normalizer * normalizer = new_algorithm->getNormalizer();

      if ( normalizer ) {

        // Note: normalization parameters should have been already computed during model creation
        new_testing_sampler->normalize( normalizer );
      }
      else {

        Log::instance()->error( "Algorithm requires normalization but did not provide a normalizer\n");
        return false;
      }
    }

    // Calculate parameters
    conf_matrix.reset(); // reuse object
    conf_matrix.calculate( new_algorithm->getModel(), new_testing_sampler );

    double myaccuracy = conf_matrix.getAccuracy() * 100;

    mean += myaccuracy;

    out_params.insert( std::pair<double, int>( myaccuracy, i ) );

    result.store( "Accuracy without layer", myaccuracy );

    result_by_layer_[samplerPtr->getEnvironment()->getLayerPath(i)] = result;

// Code for debugging:

//     string file_name = "model_";
//     char num[4];
//     sprintf( num, "%d", i);
//     file_name.append( num );

//     ConfigurationPtr config( new ConfigurationImpl("SerializedModel"));
//     ConfigurationPtr sampler_config( new_sampler->getConfiguration() );
//     config->addSubsection( sampler_config );
//     ConfigurationPtr alg_config( new_algorithm->getConfiguration() );
//     config->addSubsection( alg_config );

//     std::ostringstream model_output;
//     Configuration::writeXml( config, model_output );

//     std::ofstream file( file_name.c_str() );
//     file << model_output.str();
//     file.close();

//     break;
  }

  Log::instance()->debug( "Accuracy with all layers: %.2f%%\n", out_param );

  EnvironmentPtr environment_ptr = samplerPtr->getEnvironment();

  mean /= num_layers;

  std::multimap<double, int>::const_iterator it = out_params.begin();
  std::multimap<double, int>::const_iterator end = out_params.end();
  for ( ; it != end; ++it ) {

    Log::instance()->debug( "Accuracy without layer %d: %.2f%% (%s)\n", (*it).second, (*it).first, (environment_ptr->getLayerPath( (*it).second )).c_str() );
    variance += ((*it).first - mean)*((*it).first - mean);
  }

  Log::instance()->debug( "Mean = %f\n", mean );

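  // Estimated variance of the mean accuracy: the sum of squared deviations
  // from the mean is divided by n and then by (n - 1)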
  variance /= num_layers;

  variance /= (num_layers - 1);

  Log::instance()->debug( "Variance = %f\n", variance );

  std_deviation = sqrt(variance);

  Log::instance()->debug( "Standard deviation = %f\n", std_deviation );

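  // Standard jackknife bias correction: bias = (n - 1) * (mean of the
  // leave-one-out accuracies - accuracy with all layers); the bias-corrected
  // estimate is the full accuracy minus this bias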
  jackknife_bias = (num_layers - 1)*(mean - out_param);

  jackknife_estimate = out_param - jackknife_bias;

  Log::instance()->debug( "Jackknife estimate = %f\n", jackknife_estimate );

  Log::instance()->debug( "Jackknife bias = %f\n", jackknife_bias );

  params_.store( "Accuracy", out_param );
  params_.store( "Mean", mean );
  params_.store( "Variance", variance );
  params_.store( "Deviation", std_deviation );
  params_.store( "Estimate", jackknife_estimate );
  params_.store( "Bias", jackknife_bias );

  return true;
}