openModeller  Version 1.5.0
rf_alg.cpp
1 
27 #include "rf_alg.hh"
28 
29 #include <openmodeller/Sampler.hh>
31 
32 #include <string.h>
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <sstream>
36 
37 #include "librf/instance_set.h"
38 #include "librf/tree.h"
39 #include "librf/weights.h"
40 
41 using namespace std;
42 using namespace librf;
43 
44 /****************************************************************/
45 /********************** Algorithm's Metadata ********************/
46 
47 #define NUM_PARAM 3
48 
49 #define NUMTREES_ID "NumTrees"
50 #define K_ID "VarsPerTree"
51 #define UNSUP_ID "ForceUnsupervisedLearning"
52 
53 #define RF_LOG_PREFIX "RfAlgorithm: "
54 
55 /******************************/
56 /*** Algorithm's parameters ***/
57 
 58 static AlgParamMetadata parameters[NUM_PARAM] = {
 59 
60  // Number of Trees
61  {
62  NUMTREES_ID, // Id.
63  "Number of trees", // Name.
64  Integer, // Type.
65  "Number of trees", // Overview
66  "Number of trees", // Description.
67  1, // Not zero if the parameter has lower limit.
68  1, // Parameter's lower limit.
69  1, // Not zero if the parameter has upper limit.
70  1000, // Parameter's upper limit.
71  "10" // Parameter's typical (default) value.
72  },
73  // Number of variables per tree
74  {
75  K_ID, // Id.
76  "Number of variables per tree", // Name.
77  Integer, // Type.
78  "Number of variables per tree (zero defaults to the square root of the number of layers)", // Overview
79  "Number of variables per tree (zero defaults to the square root of the number of layers)", // Description.
80  0, // Not zero if the parameter has lower limit.
81  0, // Parameter's lower limit.
82  0, // Not zero if the parameter has upper limit.
83  0, // Parameter's upper limit.
84  "0" // Parameter's typical (default) value.
85  },
86  // Force unsupervised learning
87  {
88  UNSUP_ID, // Id.
89  "Force unsupervised learning", // Name.
90  Integer, // Type.
91  "Force unsupervised learning", // Overview
92  "When absence points are provided, this parameter can be used to ignore them forcing unsupervised learning. Note that if no absences are provided, unsupervised learning will be used anyway.", // Description.
93  1, // Not zero if the parameter has lower limit.
94  0, // Parameter's lower limit.
95  1, // Not zero if the parameter has upper limit.
96  1, // Parameter's upper limit.
97  "0" // Parameter's typical (default) value.
98  },
99 };
100 
101 /************************************/
102 /*** Algorithm's general metadata ***/
103 
 104 static AlgMetadata metadata = {
 105 
106  "RF", // Id.
107  "Random Forests", // Name.
108  "0.2", // Version.
109 
110  // Overview
111  "Random Forests",
112 
113  // Description.
114  "Random Forests",
115 
116  "Leo Breiman & Adele Cutler", // Algorithm author.
117  "Breiman, L. (2001). Random forests. Machine Learning, 45, 5-32.", // Bibliography.
118 
119  "Renato De Giovanni", // Code author.
120  "renato [at] cria . org . br", // Code author's contact.
121 
122  0, // Does not accept categorical data.
123  0, // Does not need (pseudo)absence points.
124 
125  NUM_PARAM, // Algorithm's parameters.
126  parameters
127 };
128 
129 /****************************************************************/
130 /****************** Algorithm's factory function ****************/
131 
132 OM_ALG_DLL_EXPORT
 133 AlgorithmImpl *
 134 algorithmFactory()
 135 {
136  return new RfAlgorithm();
137 }
138 
139 OM_ALG_DLL_EXPORT
140 AlgMetadata const *
 141 algorithmMetadata()
 142 {
143  return &metadata;
144 }
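The two OM_ALG_DLL_EXPORT functions above are the entry points that the openModeller host resolves from the algorithm's shared library. As a rough, hypothetical sketch only (the symbol names come from this file, but the loading code below is generic POSIX dlopen/dlsym rather than openModeller's actual AlgorithmFactory; it assumes the symbols are exported with C linkage, and the plugin file name is made up):

// Hypothetical loader sketch: locate the RF plugin's exported entry points.
#include <dlfcn.h>
#include <cstdio>

int main( int argc, char** argv )
{
  // Hypothetical plugin path; pass the real one as the first argument.
  const char* path = argc > 1 ? argv[1] : "./librf_alg.so";

  void* handle = dlopen( path, RTLD_NOW );
  if ( ! handle ) {
    std::fprintf( stderr, "dlopen failed: %s\n", dlerror() );
    return 1;
  }

  // Real return types are AlgorithmImpl* and AlgMetadata const*; kept opaque here.
  typedef void* (*EntryPoint)();
  EntryPoint factory_fn  = (EntryPoint) dlsym( handle, "algorithmFactory" );
  EntryPoint metadata_fn = (EntryPoint) dlsym( handle, "algorithmMetadata" );

  std::printf( "algorithmFactory: %s, algorithmMetadata: %s\n",
               factory_fn ? "found" : "missing",
               metadata_fn ? "found" : "missing" );

  dlclose( handle );
  return 0;
}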
145 
146 
147 /*********************************************/
 148 /*********** Random Forests algorithm ***********/
149 
150 /*******************/
151 /*** constructor ***/
152 
 153 RfAlgorithm::RfAlgorithm() :
 154  AlgorithmImpl( &metadata ),
 155  _done( false ),
156  _initialized( false )
157 {
158 }
159 
160 
161 /******************/
162 /*** destructor ***/
163 
 164 RfAlgorithm::~RfAlgorithm()
 165 {
166  if ( _initialized ) {
167 
168  delete _set;
169  }
170 
171  for ( unsigned int i = 0; i < _trees.size(); ++i ) {
172 
173  delete _trees[i];
174  }
175 }
176 
177 /**************************/
178 /*** need Normalization ***/
 179 int RfAlgorithm::needNormalization()
 180 {
181  return 0;
182 }
183 
184 /******************/
185 /*** initialize ***/
186 int
 187 RfAlgorithm::initialize()
 188 {
189  int num_layers = _samp->numIndependent();
190 
191  // Number of trees
192  if ( ! getParameter( NUMTREES_ID, &_num_trees ) ) {
193 
194  Log::instance()->error( RF_LOG_PREFIX "Parameter '" NUMTREES_ID "' not passed.\n" );
195  return 0;
196  }
197 
198  if ( _num_trees < 1 ) {
199 
200  Log::instance()->error( RF_LOG_PREFIX "Parameter '" NUMTREES_ID "' must be greater than zero.\n" );
201  return 0;
202  }
203 
204  // Don't allow people to use too much memory
205  if ( _num_trees > 1000 ) {
206 
207  Log::instance()->error( RF_LOG_PREFIX "Parameter '" NUMTREES_ID "' is greater than 1000.\n" );
208  return 0;
209  }
210 
211  _trees.reserve( _num_trees );
212 
213  // Number of variables per tree
214  if ( ! getParameter( K_ID, &_k ) ) {
215 
216  Log::instance()->error( RF_LOG_PREFIX "Parameter '" K_ID "' not passed.\n" );
217  return 0;
218  }
219 
220  if ( _k < 1 ) {
221 
222  _k = int( sqrt( double( num_layers ) ) );
223  }
224 
225  // Unsupervised learning
226  bool force_unsupervised_learning = false;
227  int unsup;
228  if ( getParameter( UNSUP_ID, &unsup ) && unsup == 1 ) {
229 
230  force_unsupervised_learning = true;
231  }
232 
233  _class_weights.resize(2, 1);
234 
235  // Check the number of presences
236  int num_presences = _samp->numPresence();
237 
238  if ( num_presences == 0 ) {
239 
240  Log::instance()->warn( RF_LOG_PREFIX "No presence points inside the mask!\n" );
241  return 0;
242  }
243 
244  // Load input points
245 
246  unsigned int seed = (unsigned int)_rand.get();
247 
248  stringstream sdata("");
249  stringstream slabels("");
250 
 251  OccurrencesImpl::const_iterator p_iterator;
 252  OccurrencesImpl::const_iterator p_end;
 253 
254  OccurrencesPtr presences = _samp->getPresences();
255 
256  p_iterator = presences->begin();
257  p_end = presences->end();
258 
259  while ( p_iterator != p_end ) {
260 
261  Sample presence = (*p_iterator)->environment();
262 
263  _sampleToLine( presence, sdata );
264 
265  slabels << "0" << endl; // presence
266 
267  ++p_iterator;
268  }
269 
270  if ( _samp->numAbsence() && ! force_unsupervised_learning ) {
271 
272  OccurrencesPtr absences = _samp->getAbsences();
273 
274  p_iterator = absences->begin();
275  p_end = absences->end();
276 
277  while ( p_iterator != p_end ) {
278 
279  Sample absence = (*p_iterator)->environment();
280 
281  _sampleToLine( absence, sdata );
282 
283  slabels << "1" << endl; // absence
284 
285  ++p_iterator;
286  }
287 
288  istream data( sdata.rdbuf() );
289  istream labels( slabels.rdbuf() );
290 
291  _set = InstanceSet::load_csv_and_labels( data, labels );
292  }
293  else {
294 
295  // Prepare for unsupervised learning
296 
297  istream data( sdata.rdbuf() );
298 
299  _set = InstanceSet::load_unsupervised( data, &seed );
300  }
301 
302  _initialized = true;
303 
304  return 1;
305 }
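For reference, initialize() hands librf two in-memory text streams: one CSV line per occurrence point (written by _sampleToLine below, one value per environmental layer with a trailing comma) and one label per line, 0 for presence and 1 for absence. A minimal standalone sketch of that layout, with invented values for three points and three layers:

// Standalone illustration of the stream layout given to
// InstanceSet::load_csv_and_labels (all values are invented).
#include <iostream>
#include <sstream>

int main()
{
  std::stringstream sdata, slabels;

  sdata << "0.12,3.40,25.1," << std::endl;  slabels << "0" << std::endl;  // presence
  sdata << "0.09,2.95,27.8," << std::endl;  slabels << "0" << std::endl;  // presence
  sdata << "0.75,1.10,12.4," << std::endl;  slabels << "1" << std::endl;  // absence

  // initialize() wraps the buffers in istreams before handing them to librf.
  std::istream data( sdata.rdbuf() );
  std::istream labels( slabels.rdbuf() );

  std::cout << sdata.str() << slabels.str();
  return 0;
}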
306 
307 
308 /**********************/
309 /*** sample to line ***/
310 void
311 RfAlgorithm::_sampleToLine( Sample sample, stringstream& ss ) const
312 {
313  for ( unsigned int j = 0; j < sample.size(); ++j ) {
314 
315  ss << sample[j] << ",";
316  }
317 
318  ss << endl;
319 }
320 
321 
322 /***************/
323 /*** iterate ***/
324 int
 325 RfAlgorithm::iterate()
 326 {
327  if ( (int)_trees.size() < _num_trees ) {
328 
329  weight_list* w = new weight_list( _set->size(), _set->size());
330 
331  // sample with replacement
332  for ( unsigned int j = 0; j < _set->size(); ++j ) {
333 
334  int instance = _rand.get( 0, _set->size() - 1 );
335  w->add( instance, _class_weights[_set->label(instance)] );
336  }
337 
338  Tree* tree = new Tree( *_set, w, _k, 1, 0, _rand.get(1000) );
339  tree->grow();
340 
341  _trees.push_back(tree);
342  }
343  else {
344 
345  _done = true;
346  }
347 
348  return 1;
349 }
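Each iterate() call grows one tree on a bootstrap sample: _set->size() instances are drawn with replacement and their weights accumulated before the tree is built. A minimal standalone sketch of the same bagging step, using std::mt19937 in place of the project's Random class and a plain vector in place of librf's weight_list (class weights are all 1 here, as set in initialize()):

// Standalone sketch of the bootstrap ("bagging") step in iterate().
#include <cstdio>
#include <random>
#include <vector>

int main()
{
  const unsigned int n = 10;              // stands in for _set->size()
  std::vector<int> label( n, 0 );         // stands in for _set->label(i)
  std::vector<int> class_weight( 2, 1 );  // mirrors _class_weights.resize(2, 1)

  std::mt19937 rng( 42 );
  std::uniform_int_distribution<unsigned int> pick( 0, n - 1 );

  std::vector<int> weight( n, 0 );        // stands in for the weight_list
  for ( unsigned int j = 0; j < n; ++j ) {
    unsigned int instance = pick( rng );  // sample with replacement
    weight[instance] += class_weight[ label[instance] ];
  }

  // Some instances are drawn several times, others not at all ("out of bag").
  for ( unsigned int i = 0; i < n; ++i ) {
    std::printf( "instance %u drawn %d time(s)\n", i, weight[i] );
  }
  return 0;
}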
350 
351 /********************/
352 /*** get Progress ***/
 353 float RfAlgorithm::getProgress() const
 354 {
355  if ( done() ) {
356 
357  return 1.0;
358  }
359 
360  return (float)_trees.size() / (float)_num_trees;
361 }
362 
363 
364 /************/
365 /*** done ***/
366 int
 367 RfAlgorithm::done() const
 368 {
369  return _done;
370 }
371 
372 /*****************/
373 /*** get Value ***/
374 Scalar
375 RfAlgorithm::getValue( const Sample& x ) const
376 {
377  stringstream sdata("");
378 
379  _sampleToLine( x, sdata );
380 
381  istream data( sdata.rdbuf() );
382 
383  stringstream slabels("0");
384 
385  istream labels( slabels.rdbuf() );
386 
387  InstanceSet* set = InstanceSet::load_csv_and_labels( data, labels );
388 
389  DiscreteDist votes;
390 
391  for ( unsigned int i = 0; i < _trees.size(); ++i ) {
392 
393  int predict = _trees[i]->predict( *set, 0 );
394  votes.add( predict );
395  }
396 
397  float prob = votes.percentage(0);
398 
399  delete set;
400 
401  return (double)prob;
402 }
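getValue() classifies the point with every tree and returns the fraction of trees that vote for class 0 (presence), which is what librf's DiscreteDist::percentage(0) yields. A minimal standalone sketch of that aggregation, with invented per-tree votes:

// Standalone sketch of the vote aggregation done in getValue().
#include <cstdio>
#include <vector>

int main()
{
  // Hypothetical predictions from ten trees: 0 = presence, 1 = absence.
  std::vector<int> votes = { 0, 0, 1, 0, 0, 1, 0, 0, 0, 1 };

  int presence_votes = 0;
  for ( unsigned int i = 0; i < votes.size(); ++i ) {
    if ( votes[i] == 0 ) ++presence_votes;
  }

  double suitability = (double) presence_votes / (double) votes.size();
  std::printf( "suitability = %.2f\n", suitability );  // 0.70 for this vote set
  return 0;
}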
403 
404 /***********************/
405 /*** get Convergence ***/
406 int
 407 RfAlgorithm::getConvergence( Scalar * const val ) const
 408 {
409  *val = 1.0;
410  return 1;
411 }
412 
413 /****************************************************************/
414 /****************** configuration *******************************/
415 void
 416 RfAlgorithm::_getConfiguration( ConfigurationPtr& config ) const
 417 {
418  if ( ! _done )
419  return;
420 
421  ConfigurationPtr model_config( new ConfigurationImpl("Rf") );
422  config->addSubsection( model_config );
423 
424  model_config->addNameValue( "Trees", _num_trees );
425  model_config->addNameValue( "K", _k );
426 
427  Tree* p_tree = NULL;
428  tree_node* p_node = NULL;
429 
430  unsigned int num_nodes;
431 
432  char buffer[5];
433 
434  for ( int i=0; i < _num_trees; ++i ) {
435 
436  p_tree = _trees[i];
437 
438  ConfigurationPtr tree_config( new ConfigurationImpl("Tree") );
439 
440  num_nodes = p_tree->num_nodes();
441 
442  tree_config->addNameValue( "Nodes", (int)num_nodes );
443 
444  sprintf( buffer, "%4.2f", p_tree->training_accuracy() );
445 
446  tree_config->addNameValue( "Accuracy", buffer );
447  tree_config->addNameValue( "Split", (int)p_tree->num_split_nodes() );
448  tree_config->addNameValue( "Terminal", (int)p_tree->num_terminal_nodes() );
449 
450  for ( unsigned int j= 0; j < num_nodes; ++j ) {
451 
452  p_node = p_tree->get_node( j );
453 
454  ConfigurationPtr node_config( new ConfigurationImpl("Node") );
455 
456  librf::NodeStatusType status = p_node->status;
457 
458  node_config->addNameValue( "Status", (int)status );
459 
460  if ( status == SPLIT ) {
461 
462  node_config->addNameValue( "L", (int)p_node->left );
463  node_config->addNameValue( "R", (int)p_node->right );
464  node_config->addNameValue( "A", (int)p_node->attr );
465  node_config->addNameValue( "S", (float)p_node->split_point );
466  }
467  else if ( status == TERMINAL ) {
468 
469  node_config->addNameValue( "V", (char)p_node->label );
470  }
471 
472  tree_config->addSubsection( node_config );
473  }
474 
475  model_config->addSubsection( tree_config );
476  }
477 }
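_getConfiguration() persists each tree as a flat list of nodes: SPLIT nodes carry child indices (L, R), the tested attribute (A) and the split point (S), while TERMINAL nodes carry the predicted label (V). A standalone sketch of how such a flat node array encodes a tree and how a sample could be routed through it; the "values below the split point go left" convention is an assumption for illustration, not taken from librf:

// Standalone sketch: a flat node array like the one serialized above,
// plus a traversal that routes a sample from the root (node 0) to a leaf.
#include <cstdio>
#include <vector>

enum Status { SPLIT, TERMINAL };

struct Node {
  Status status;
  int left, right;     // child indices ("L", "R") for SPLIT nodes
  int attr;            // index of the tested variable ("A")
  float split_point;   // threshold ("S")
  int label;           // predicted class ("V") for TERMINAL nodes
};

int predict( const std::vector<Node>& nodes, const std::vector<float>& x )
{
  int i = 0;  // start at the root
  while ( nodes[i].status == SPLIT ) {
    // Assumed convention: below the split point goes left, otherwise right.
    i = ( x[ nodes[i].attr ] < nodes[i].split_point ) ? nodes[i].left : nodes[i].right;
  }
  return nodes[i].label;
}

int main()
{
  // Tiny two-level tree: test variable 0 at 0.5, then variable 1 at 2.0.
  std::vector<Node> nodes = {
    { SPLIT,     1,  2,  0, 0.5f, -1 },  // node 0
    { TERMINAL, -1, -1, -1, 0.0f,  0 },  // node 1: presence
    { SPLIT,     3,  4,  1, 2.0f, -1 },  // node 2
    { TERMINAL, -1, -1, -1, 0.0f,  0 },  // node 3: presence
    { TERMINAL, -1, -1, -1, 0.0f,  1 },  // node 4: absence
  };

  std::vector<float> sample = { 0.8f, 3.1f };
  std::printf( "predicted class: %d\n", predict( nodes, sample ) );  // prints 1 (absence)
  return 0;
}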
478 
479 void
 480 RfAlgorithm::_setConfiguration( const ConstConfigurationPtr& config )
 481 {
482  ConstConfigurationPtr model_config = config->getSubsection( "Rf", false );
483 
484  if ( ! model_config )
485  return;
486 
487  _num_trees = model_config->getAttributeAsInt( "Trees", 0 );
488 
489  _k = model_config->getAttributeAsInt( "K", 0 );
490 
491  _trees.reserve( _num_trees );
492 
493  Configuration::subsection_list trees = model_config->getAllSubsections();
494 
495  Configuration::subsection_list::iterator tree = trees.begin();
496  Configuration::subsection_list::iterator last_tree = trees.end();
497 
498  for ( ; tree != last_tree; ++tree ) {
499 
500  if ( (*tree)->getName() != "Tree" ) {
501 
502  continue;
503  }
504 
505  Tree* my_tree = new Tree();
506 
507  Configuration::subsection_list nodes = (*tree)->getAllSubsections();
508 
509  Configuration::subsection_list::iterator node = nodes.begin();
510  Configuration::subsection_list::iterator last_node = nodes.end();
511 
512  for ( ; node != last_node; ++node ) {
513 
514  if ( (*node)->getName() != "Node" ) {
515 
516  continue;
517  }
518 
519  int status = (*node)->getAttributeAsInt( "Status", 0 );
520 
521  tree_node my_node;
522 
523  if ( status == SPLIT ) {
524 
525  my_node.status = SPLIT;
526  my_node.left = (*node)->getAttributeAsInt( "L", 0 );
527  my_node.right = (*node)->getAttributeAsInt( "R", 0 );
528  my_node.attr = (*node)->getAttributeAsInt( "A", 0 );
529  double split_point = (*node)->getAttributeAsDouble( "S", 0.0 );
530  my_node.split_point = (float)split_point;
531  }
532  else if ( status == TERMINAL ) {
533 
534  my_node.status = TERMINAL;
535  int label = (*node)->getAttributeAsInt( "V", 0 );
536  my_node.label = uchar(label);
537  }
538  else {
539 
540  continue;
541  }
542 
543  my_tree->add_node( my_node );
544  }
545 
546  _trees.push_back( my_tree );
547  }
548 
549  _initialized = true;
550 
551  _done = true;
552 }