PreJackknife.cpp
// (license header and the remaining openModeller / pre-analysis includes of the
//  original file are not shown in this listing)
#include <openmodeller/Sampler.hh>
#include <openmodeller/Log.hh>

#include <string.h>
#include <math.h>

using namespace std;

/*******************/
/*** constructor ***/

PreJackknife::PreJackknife()
{
}


/*******************/
/*** destructor ***/

PreJackknife::~PreJackknife()
{
}

bool
PreJackknife::checkParameters( const PreParameters& parameters ) const
{
  SamplerPtr samplerPtr;

  if ( ! parameters.retrieve( "Sampler", samplerPtr ) ) {

    Log::instance()->error( "Missing parameter: Sampler. \n" );
    return false;
  }

  AlgorithmPtr algorithmPtr;

  if ( ! parameters.retrieve( "Algorithm", algorithmPtr ) ) {

    Log::instance()->error( "Missing parameter: Algorithm. \n" );
    return false;
  }

  return true;
}
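
The parameter container checked above is populated by the caller before the pre-analysis
runs. Below is a minimal sketch (not part of this file) of the expected keys; the sampler
and algorithm objects are assumed to come from the usual openModeller factories and are
only indicated here:

  PreParameters params;
  params.store( "Sampler", samplerPtr );      // required: SamplerPtr with an Environment
  params.store( "Algorithm", algorithmPtr );  // required: AlgorithmPtr to be evaluated
  params.store( "PropTrain", 0.9 );           // optional: training proportion (default 0.9)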

void
PreJackknife::getAcceptedParameters( stringMap& info )
{
  info["Sampler"] = "samplerPtr";
  info["Algorithm"] = "algorithmPtr";
  info["PropTrain"] = "double";
}

void
PreJackknife::getLayersetResultSpec( stringMap& info )
{
  info["Accuracy"] = "double";
  info["Mean"] = "double";
  info["Variance"] = "double";
  info["Deviation"] = "double";
  info["Estimate"] = "double";
  info["Bias"] = "double";
}

void
PreJackknife::getLayerResultSpec( stringMap& info )
{
  info["Accuracy without layer"] = "double";
}
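
These specs list the keys under which runImplementation() stores its results: the
layerset-level values go into the algorithm's own PreParameters container (params_),
and the per-layer accuracy is stored in result_by_layer_ keyed by layer path. A minimal
sketch of reading values back with PreParameters::retrieve(), assuming the framework
hands the caller the populated container (the `results` name here is hypothetical):

  double accuracy, estimate, bias;
  results.retrieve( "Accuracy", accuracy ); // accuracy (%) with all layers
  results.retrieve( "Estimate", estimate ); // jackknife estimate of the accuracy
  results.retrieve( "Bias", bias );         // jackknife bias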

bool
PreJackknife::runImplementation()
{
  Log::instance()->debug( "Running jackknife\n" );

  SamplerPtr samplerPtr;
  params_.retrieve( "Sampler", samplerPtr );

  AlgorithmPtr algorithmPtr;
  params_.retrieve( "Algorithm", algorithmPtr );

  double propTrain;

  if ( ! params_.retrieve( "PropTrain", propTrain ) ) {

    // default: use 90% of the points for training
    propTrain = 0.9;
  }

  if ( ! samplerPtr->getEnvironment() ) {

    std::string msg = "Sampler has no environment.\n";

    Log::instance()->error( msg.c_str() );

    throw InvalidParameterException( msg );
  }

  int num_layers = samplerPtr->numIndependent();

  if ( num_layers < 2 ) {

    std::string msg = "Jackknife needs at least 2 layers.\n";

    Log::instance()->error( msg.c_str() );

    throw InvalidParameterException( msg );
  }

  // Split sampler into test and training sets
  SamplerPtr training_sampler;
  SamplerPtr testing_sampler;

  splitSampler( samplerPtr, &training_sampler, &testing_sampler, propTrain );

  // Calculate the reference parameter using all layers
  AlgorithmPtr algorithm_ptr = algorithmPtr->getFreshCopy();

  algorithm_ptr->createModel( training_sampler );

  ConfusionMatrix conf_matrix;

  conf_matrix.calculate( algorithm_ptr->getModel(), testing_sampler );

  double out_param = conf_matrix.getAccuracy() * 100;

  // Calculate the reference parameter for each layer by excluding it from the layer set

  std::multimap<double, int> out_params;

  double mean = 0.0;
  double variance = 0.0;
  double std_deviation = 0.0;
  double jackknife_estimate = 0.0;
  double jackknife_bias = 0.0;

  // Work with clones of the occurrences
  OccurrencesPtr training_presences;
  OccurrencesPtr training_absences;
  OccurrencesPtr testing_presences;
  OccurrencesPtr testing_absences;

  if ( training_sampler->numPresence() ) {

    training_presences = training_sampler->getPresences()->clone();
  }

  if ( training_sampler->numAbsence() ) {

    training_absences = training_sampler->getAbsences()->clone();
  }

  if ( testing_sampler->numPresence() ) {

    testing_presences = testing_sampler->getPresences()->clone();
  }

  if ( testing_sampler->numAbsence() ) {

    testing_absences = testing_sampler->getAbsences()->clone();
  }

  for ( int i = 0; i < num_layers; ++i ) {

    Log::instance()->debug( "Removing layer with index %u\n", i );

    // Copy the original environment
    EnvironmentPtr new_environment = samplerPtr->getEnvironment()->clone();

    PreParameters result;

    // Remove one of the layers
    new_environment->removeLayer( i );

    // Read environment data from the new set of layers
    if ( training_presences ) {

      training_presences->setEnvironment( new_environment );
    }

    if ( training_absences ) {

      training_absences->setEnvironment( new_environment );
    }

    if ( testing_presences ) {

      testing_presences->setEnvironment( new_environment );
    }

    if ( testing_absences ) {

      testing_absences->setEnvironment( new_environment );
    }

    // Create a new sampler for training points
    SamplerPtr new_training_sampler = createSampler( new_environment, training_presences, training_absences );

    // Create a new algorithm
    AlgorithmPtr new_algorithm = algorithmPtr->getFreshCopy();

    new_algorithm->createModel( new_training_sampler );

    conf_matrix.reset();

    // Create a new sampler for testing points
    SamplerPtr new_testing_sampler = createSampler( new_environment, testing_presences, testing_absences );

    // Normalize test samples if necessary
    if ( new_algorithm->needNormalization() && ! new_testing_sampler->isNormalized() ) {

      Log::instance()->info( "Computing normalization for test points\n" );

      Normalizer * normalizer = new_algorithm->getNormalizer();

      if ( normalizer ) {

        // Note: normalization parameters should already have been computed during model creation
        new_testing_sampler->normalize( normalizer );
      }
      else {

        Log::instance()->error( "Jackknife algorithm requires normalization but did not specify any normalizer\n" );
        return false;
      }
    }

    // Calculate parameters
    conf_matrix.reset(); // reuse object
    conf_matrix.calculate( new_algorithm->getModel(), new_testing_sampler );

    double myaccuracy = conf_matrix.getAccuracy() * 100;

    mean += myaccuracy;

    out_params.insert( std::pair<double, int>( myaccuracy, i ) );

    result.store( "Accuracy without layer", myaccuracy );

    result_by_layer_[samplerPtr->getEnvironment()->getLayerPath(i)] = result;

    // Code for debugging:

    // string file_name = "model_";
    // char num[4];
    // sprintf( num, "%d", i );
    // file_name.append( num );

    // ConfigurationPtr config( new ConfigurationImpl( "SerializedModel" ) );
    // ConfigurationPtr sampler_config( new_sampler->getConfiguration() );
    // config->addSubsection( sampler_config );
    // ConfigurationPtr alg_config( new_algorithm->getConfiguration() );
    // config->addSubsection( alg_config );

    // std::ostringstream model_output;
    // Configuration::writeXml( config, model_output );

    // std::ofstream file( file_name.c_str() );
    // file << model_output.str();
    // file.close();

    // break;
  }

  Log::instance()->debug( "Accuracy with all layers: %.2f%%\n", out_param );

  EnvironmentPtr environment_ptr = samplerPtr->getEnvironment();

  mean /= num_layers;

  std::multimap<double, int>::const_iterator it = out_params.begin();
  std::multimap<double, int>::const_iterator end = out_params.end();
  for ( ; it != end; ++it ) {

    Log::instance()->debug( "Accuracy without layer %d: %.2f%% (%s)\n", (*it).second, (*it).first, (environment_ptr->getLayerPath( (*it).second )).c_str() );
    variance += ((*it).first - mean)*((*it).first - mean);
  }

  Log::instance()->debug( "Mean = %f\n", mean );

  variance /= num_layers;

  variance /= (num_layers - 1);

  Log::instance()->debug( "Variance = %f\n", variance );

  std_deviation = sqrt(variance);

  Log::instance()->debug( "Standard deviation = %f\n", std_deviation );

  // First-order jackknife: bias = (n-1) * (mean of leave-one-out values - full value)
  jackknife_bias = (num_layers - 1)*(mean - out_param);

  // Bias-corrected estimate of the accuracy
  jackknife_estimate = out_param - jackknife_bias;

  Log::instance()->debug( "Jackknife estimate = %f\n", jackknife_estimate );

  Log::instance()->debug( "Jackknife bias = %f\n", jackknife_bias );

  params_.store( "Accuracy", out_param );
  params_.store( "Mean", mean );
  params_.store( "Variance", variance );
  params_.store( "Deviation", std_deviation );
  params_.store( "Estimate", jackknife_estimate );
  params_.store( "Bias", jackknife_bias );

  return true;
}
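
For reference, the statistics stored at the end of runImplementation() correspond to a
first-order jackknife over the n = num_layers leave-one-layer-out runs, where $\theta$ is
the accuracy (in %) obtained with all layers and $\theta_{-i}$ the accuracy obtained after
removing layer $i$. This is a summary of the code above, not additional openModeller
documentation:

  $\bar{\theta} = \frac{1}{n}\sum_{i=1}^{n}\theta_{-i}$, \qquad
  $\mathrm{Variance} = \frac{1}{n(n-1)}\sum_{i=1}^{n}(\theta_{-i}-\bar{\theta})^2$, \qquad
  $\mathrm{Deviation} = \sqrt{\mathrm{Variance}}$

  $\mathrm{Bias} = (n-1)\,(\bar{\theta} - \theta)$, \qquad
  $\mathrm{Estimate} = \theta - \mathrm{Bias}$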