openModeller — pre-analysis: jackknife layer-importance test (PreJackknife.cpp)
Version 1.4.0
00001 00027 #include <openmodeller/pre/PreJackknife.hh> 00028 #include <openmodeller/Sampler.hh> 00029 #include <openmodeller/Algorithm.hh> 00030 #include <openmodeller/Occurrences.hh> 00031 #include <openmodeller/Environment.hh> 00032 #include <openmodeller/ConfusionMatrix.hh> 00033 #include <openmodeller/Log.hh> 00034 00035 #include <openmodeller/Exceptions.hh> 00036 00037 #include <string.h> 00038 #include <math.h> 00039 00040 using namespace std; 00041 00042 /*******************/ 00043 /*** constructor ***/ 00044 PreJackknife::PreJackknife() 00045 { 00046 } 00047 00048 00049 /*******************/ 00050 /*** destructor ***/ 00051 PreJackknife::~PreJackknife() 00052 { 00053 } 00054 00055 bool 00056 PreJackknife::checkParameters( const PreParameters& parameters ) const 00057 { 00058 SamplerPtr samplerPtr; 00059 00060 if ( ! parameters.retrieve( "Sampler", samplerPtr ) ) { 00061 00062 Log::instance()->error( "Missing parameter: Sampler. \n" ); 00063 return false; 00064 } 00065 00066 AlgorithmPtr algorithmPtr; 00067 00068 if ( ! parameters.retrieve( "Algorithm", algorithmPtr ) ) { 00069 00070 Log::instance()->error( "Missing parameter: Algorithm. 
\n" ); 00071 return false; 00072 } 00073 00074 return true; 00075 } 00076 00077 void 00078 PreJackknife::getAcceptedParameters( stringMap& info) 00079 { 00080 info["Sampler"] = "samplerPtr"; 00081 info["Algorithm"] = "algorithmPtr"; 00082 info["PropTrain"] = "double"; 00083 } 00084 00085 void 00086 PreJackknife::getLayersetResultSpec ( stringMap& info) 00087 { 00088 info["Accuracy"] = "double"; 00089 info["Mean"] = "double"; 00090 info["Variance"] = "double"; 00091 info["Deviation"] = "double"; 00092 info["Estimate"] = "double"; 00093 info["Bias"] = "double"; 00094 } 00095 00096 void 00097 PreJackknife::getLayerResultSpec ( stringMap& info) 00098 { 00099 info["Accuracy without layer"] = "double"; 00100 } 00101 00102 bool PreJackknife::runImplementation() 00103 { 00104 Log::instance()->debug( "Running jackknife\n" ); 00105 00106 SamplerPtr samplerPtr; 00107 params_.retrieve( "Sampler", samplerPtr ); 00108 00109 AlgorithmPtr algorithmPtr; 00110 params_.retrieve( "Algorithm", algorithmPtr ); 00111 00112 double propTrain; 00113 00114 if ( ! params_.retrieve( "PropTrain", propTrain ) ) { 00115 00116 // default 00117 propTrain = 0.9; 00118 } 00119 00120 if ( ! 
samplerPtr->getEnvironment() ) { 00121 00122 std::string msg = "Sampler has no environment.\n"; 00123 00124 Log::instance()->error( msg.c_str() ); 00125 00126 throw InvalidParameterException( msg ); 00127 } 00128 00129 int num_layers = samplerPtr->numIndependent(); 00130 00131 if ( num_layers < 2 ) { 00132 00133 std::string msg = "Jackknife needs at least 2 layers.\n"; 00134 00135 Log::instance()->error( msg.c_str() ); 00136 00137 throw InvalidParameterException( msg ); 00138 } 00139 00140 // Split sampler into test and trainning 00141 SamplerPtr training_sampler; 00142 SamplerPtr testing_sampler; 00143 00144 splitSampler( samplerPtr, &training_sampler, &testing_sampler, propTrain ); 00145 00146 // Calculate reference parameter using all layers 00147 AlgorithmPtr algorithm_ptr = algorithmPtr->getFreshCopy(); 00148 00149 algorithm_ptr->createModel( training_sampler ); 00150 00151 ConfusionMatrix conf_matrix; 00152 00153 conf_matrix.calculate( algorithm_ptr->getModel(), testing_sampler ); 00154 00155 double out_param = conf_matrix.getAccuracy() * 100; 00156 00157 // Calculate reference parameter for each layer by excluding it from the layer set 00158 00159 std::multimap<double, int> out_params; 00160 00161 double mean = 0.0; 00162 double variance = 0.0; 00163 double std_deviation = 0.0; 00164 double jackknife_estimate = 0.0; 00165 double jackknife_bias = 0.0; 00166 00167 // Work with clones of the occurrences 00168 OccurrencesPtr training_presences; 00169 OccurrencesPtr training_absences; 00170 OccurrencesPtr testing_presences; 00171 OccurrencesPtr testing_absences; 00172 00173 if ( training_sampler->numPresence() ) { 00174 00175 training_presences = training_sampler->getPresences()->clone(); 00176 } 00177 00178 if ( training_sampler->numAbsence() ) { 00179 00180 training_absences = training_sampler->getAbsences()->clone(); 00181 } 00182 00183 if ( testing_sampler->numPresence() ) { 00184 00185 testing_presences = testing_sampler->getPresences()->clone(); 00186 } 
00187 00188 if ( testing_sampler->numAbsence() ) { 00189 00190 testing_absences = testing_sampler->getAbsences()->clone(); 00191 } 00192 00193 for ( int i = 0; i < num_layers; ++i ) { 00194 00195 Log::instance()->debug( "Removing layer with index %u\n", i ); 00196 00197 // Copy the original environment 00198 EnvironmentPtr new_environment = samplerPtr->getEnvironment()->clone(); 00199 00200 PreParameters result; 00201 00202 // Remove one of the layers 00203 new_environment->removeLayer( i ); 00204 00205 // Read environment data from the new set of layers 00206 if ( training_presences ) { 00207 00208 training_presences->setEnvironment( new_environment ); 00209 } 00210 00211 if ( training_absences ) { 00212 00213 training_absences->setEnvironment( new_environment ); 00214 } 00215 00216 if ( testing_presences ) { 00217 00218 testing_presences->setEnvironment( new_environment ); 00219 } 00220 00221 if ( testing_absences ) { 00222 00223 testing_absences->setEnvironment( new_environment ); 00224 } 00225 00226 // Create a new sampler for trainning points 00227 SamplerPtr new_training_sampler = createSampler( new_environment, training_presences, training_absences ); 00228 00229 // Create a new algorithm 00230 AlgorithmPtr new_algorithm = algorithmPtr->getFreshCopy(); 00231 00232 new_algorithm->createModel( new_training_sampler ); 00233 00234 conf_matrix.reset(); 00235 00236 // Create a new sampler for testing points 00237 SamplerPtr new_testing_sampler = createSampler( new_environment, testing_presences, testing_absences ); 00238 00239 // Normalize test samples if necessary 00240 if ( new_algorithm->needNormalization() && ! 
new_testing_sampler->isNormalized() ) { 00241 00242 Log::instance()->info( "Computing normalization for test points\n"); 00243 00244 Normalizer * normalizer = new_algorithm->getNormalizer(); 00245 00246 if ( normalizer ) { 00247 00248 // Note: normalization parameters should have been already computed during model creation 00249 new_testing_sampler->normalize( normalizer ); 00250 } 00251 else { 00252 00253 Log::instance()->error( "Jackknife algorithm requires normalization but did not specify any normalizer\n"); 00254 return false; 00255 } 00256 } 00257 00258 // Calculate parameters 00259 conf_matrix.reset(); // reuse object 00260 conf_matrix.calculate( new_algorithm->getModel(), new_testing_sampler ); 00261 00262 double myaccuracy = conf_matrix.getAccuracy() * 100; 00263 00264 mean += myaccuracy; 00265 00266 out_params.insert( std::pair<double, int>( myaccuracy, i ) ); 00267 00268 result.store( "Accuracy without layer", myaccuracy ); 00269 00270 result_by_layer_[samplerPtr->getEnvironment()->getLayerPath(i)] = result; 00271 00272 // Code for debugging: 00273 00274 // string file_name = "model_"; 00275 // char num[4]; 00276 // sprintf( num, "%d", i); 00277 // file_name.append( num ); 00278 00279 // ConfigurationPtr config( new ConfigurationImpl("SerializedModel")); 00280 // ConfigurationPtr sampler_config( new_sampler->getConfiguration() ); 00281 // config->addSubsection( sampler_config ); 00282 // ConfigurationPtr alg_config( new_algorithm->getConfiguration() ); 00283 // config->addSubsection( alg_config ); 00284 00285 // std::ostringstream model_output; 00286 // Configuration::writeXml( config, model_output ); 00287 00288 // std::ofstream file( file_name.c_str() ); 00289 // file << model_output.str(); 00290 // file.close(); 00291 00292 // break; 00293 } 00294 00295 Log::instance()->debug( "Accuracy with all layers: %.2f%%\n", out_param ); 00296 00297 EnvironmentPtr environment_ptr = samplerPtr->getEnvironment(); 00298 00299 mean /= num_layers; 00300 00301 
std::multimap<double, int>::const_iterator it = out_params.begin(); 00302 std::multimap<double, int>::const_iterator end = out_params.end(); 00303 for ( ; it != end; ++it ) { 00304 00305 Log::instance()->debug( "Accuracy without layer %d: %.2f%% (%s)\n", (*it).second, (*it).first, (environment_ptr->getLayerPath( (*it).second )).c_str() ); 00306 variance += ((*it).first - mean)*((*it).first - mean); 00307 } 00308 00309 Log::instance()->debug( "Mean = %f\n", mean ); 00310 00311 variance /= num_layers; 00312 00313 variance /= (num_layers - 1); 00314 00315 Log::instance()->debug( "Variance = %f\n", variance ); 00316 00317 std_deviation = sqrt(variance); 00318 00319 Log::instance()->debug( "Standard deviation = %f\n", std_deviation ); 00320 00321 jackknife_bias = (num_layers - 1)*(mean - out_param); 00322 00323 jackknife_estimate = out_param - jackknife_bias; 00324 00325 Log::instance()->debug( "Jackknife estimate = %f\n", jackknife_estimate ); 00326 00327 Log::instance()->debug( "Jackknife bias = %f\n", jackknife_bias ); 00328 00329 params_.store( "Accuracy", out_param ); 00330 params_.store( "Mean", mean ); 00331 params_.store( "Variance", variance ); 00332 params_.store( "Deviation", std_deviation ); 00333 params_.store( "Estimate", jackknife_estimate ); 00334 params_.store( "Bias", jackknife_bias ); 00335 00336 00337 00338 return true; 00339 }