openModeller  Version 1.4.0
TapirOccurrences.cpp
Go to the documentation of this file.
00001 
00027 #include <openmodeller/occ_io/TapirOccurrences.hh>
00028 
00029 #include <openmodeller/Occurrences.hh>
00030 
00031 #include <openmodeller/Exceptions.hh>
00032 
00033 #include <stdlib.h>
00034 
00035 #include <string.h>
00036 using std::string;
00037 
00038 #include <sstream>
00039 using std::ostringstream;
00040 
00041 #include <curl/curl.h>
00042 
00043 #include <expat.h>
00044 
00045 #define TP_TEMPLATE_LOCATION "http://openmodeller.cria.org.br/xml/tapir/1.0/st_v3.xml"
00046 #define TP_TEMPLATE_STRSIZE 55
00047 #define TP_OUTPUT_MODEL_LOCATION "http://openmodeller.cria.org.br/xml/tapir/1.0/om.xml"
00048 #define TP_OUTPUT_MODEL_STRSIZE 52
00049 
00050 /*****************************************/
00051 /*** create OccurrencesReader callback ***/
00052 OccurrencesReader * 
00053 TapirOccurrences::CreateOccurrencesReaderCallback( const char *source, const char *coordSystem )
00054 {
00055   return new TapirOccurrences( source, coordSystem );
00056 }
00057 
00058 /***************************/
00059 /*** _curl header writer ***/
00060 size_t 
00061 TapirOccurrences::_curl_header_writer( void *ptr, size_t size, size_t nmemb, void *stream )
00062 {
00063    if ( stream == NULL ) {
00064 
00065      return 0;
00066    }
00067 
00068   // *stream is actually a string object
00069   std::string& str = *( reinterpret_cast<std::string*>( stream ) );
00070 
00071   str.append( reinterpret_cast<const char*>(ptr), size*nmemb );
00072 
00073   return size*nmemb;
00074 }
00075 
00076 /*************************/
00077 /*** _curl body writer ***/
00078 size_t 
00079 TapirOccurrences::_curl_body_writer( void *ptr, size_t size, size_t nmemb, void *stream )
00080 {
00081    if ( stream == NULL ) {
00082 
00083      return 0;
00084    }
00085 
00086   // *stream is actually a string object
00087   std::string& str = *( reinterpret_cast<std::string*>( stream ) );
00088 
00089   str.append( reinterpret_cast<const char*>(ptr), size*nmemb );
00090 
00091   return size*nmemb;
00092 }
00093 
00094 /*******************/
00095 /*** Constructor ***/
00096 TapirOccurrences::TapirOccurrences( const char *source, const char *coordSystem )
00097 {
00098   if ( curl_global_init( CURL_GLOBAL_ALL ) != CURLE_OK ) {
00099 
00100     throw OccurrencesReaderException( "Could not initialize libcurl" );
00101   }
00102 
00103   _loaded = false;
00104 
00105   _source = (char *) source; // TAPIR endpoint
00106 
00107   _coord_system = (char *) coordSystem;
00108 
00109   _default_limit = 100;
00110 }
00111 
00112 
00113 /******************/
00114 /*** Destructor ***/
00115 TapirOccurrences::~TapirOccurrences()
00116 {
00117   curl_global_cleanup();
00118 }
00119 
00120 
00121 /************/
00122 /*** load ***/
00123 bool
00124 TapirOccurrences::load()
00125 {
00126   if ( _loaded ) {
00127 
00128     return true;
00129   }
00130 
00131   Log::instance()->info( "Checking endpoint using TAPIR driver\n" );
00132 
00133   Log::instance()->info( "CURL version is %s\n", curl_version() );
00134 
00135   // Prepare CURL handle
00136   CURL * curl_handle = curl_easy_init(); 
00137 
00138   if ( curl_handle == NULL ) {
00139 
00140     Log::instance()->error( "TapirOccurrences::load - Could not initialize curl handle\n" );
00141     return false;
00142   }
00143 
00144   // Prepare TAPIR capabilities request
00145   std::string capabilities_url( _source );
00146 
00147   if ( capabilities_url.find( "?" ) != string::npos ) {
00148 
00149     capabilities_url.append( "&" );
00150   }
00151   else {
00152 
00153     capabilities_url.append( "?" );
00154   } 
00155 
00156   capabilities_url.append( "op=c" );
00157 
00158   // Set CURL options
00159   if ( curl_easy_setopt( curl_handle, CURLOPT_URL, capabilities_url.c_str() ) != CURLE_OK ) {
00160 
00161     Log::instance()->error( "TapirOccurrences::load - Failed to set CURLOPT_URL\n" );
00162     curl_easy_cleanup( curl_handle );
00163     return false;
00164   }
00165 
00166   if ( curl_easy_setopt( curl_handle, CURLOPT_HEADERFUNCTION, &TapirOccurrences::_curl_header_writer ) != CURLE_OK ) {
00167 
00168     Log::instance()->error( "TapirOccurrences::load - Failed to set CURLOPT_HEADERFUNCTION\n" );
00169     curl_easy_cleanup( curl_handle );
00170     return false;
00171   }
00172 
00173   std::string header;
00174 
00175   if ( curl_easy_setopt( curl_handle, CURLOPT_WRITEHEADER, &header ) != CURLE_OK ) {
00176 
00177     Log::instance()->error( "TapirOccurrences::load - Failed to set CURLOPT_WRITEHEADER\n" );
00178     curl_easy_cleanup( curl_handle );
00179     return false;
00180   }
00181 
00182   if ( curl_easy_setopt( curl_handle, CURLOPT_WRITEFUNCTION, &TapirOccurrences::_curl_body_writer ) != CURLE_OK ) {
00183 
00184     Log::instance()->error( "TapirOccurrences::load - Failed to set CURLOPT_WRITEFUNCTION\n" );
00185     curl_easy_cleanup( curl_handle );
00186     return false;
00187   }
00188 
00189   std::string capabilities_response;
00190 
00191   if ( curl_easy_setopt( curl_handle, CURLOPT_WRITEDATA, &capabilities_response ) != CURLE_OK ) {
00192 
00193     Log::instance()->error( "TapirOccurrences::load - Failed to set CURLOPT_WRITEDATA\n" );
00194     curl_easy_cleanup( curl_handle );
00195     return false;
00196   }
00197 
00198   Log::instance()->info( "Getting capabilities using %s\n", capabilities_url.c_str() );
00199 
00200   // Send capabilities request
00201   CURLcode curl_code = curl_easy_perform( curl_handle );
00202 
00203   if ( curl_code != CURLE_OK )
00204   {
00205     Log::instance()->error( "TapirOccurrences::load - Could not get service capabilities from the specified endpoint (CURL code error: %d)\n", curl_code );
00206     curl_easy_cleanup( curl_handle );
00207     return false;
00208   }
00209 
00210   curl_easy_cleanup( curl_handle );
00211 
00212   // Check content type returned
00213   if ( header.find( "Content-Type: text/xml" ) == string::npos )
00214   {
00215     Log::instance()->warn( "TapirOccurrences::load - URL does not seem to be a TAPIR endpoint (HTTP Content-Type header is not text/xml)\n" );
00216 
00217     return false;
00218   }
00219 
00220   // Data structure to store relevant capabilities information
00221   CapabilitiesInfo info;
00222 
00223   // Parse capabilities response
00224   if ( ! _parseCapabilities( &capabilities_response, &info ) ) {
00225 
00226     return false;
00227   }
00228 
00229   // Check capabilitites response
00230   if ( ! info._is_tapir ) {
00231 
00232     Log::instance()->warn( "TapirOccurrences::load - URL does not seem to be a TAPIR endpoint (no TAPIR response element detected)\n" );
00233     return false;
00234   }
00235   if ( ! info._has_guid ) {
00236 
00237     Log::instance()->error( "TapirOccurrences::load - Provider did not map the DarwinCore GlobalUniqueIdentifier concept\n" );
00238     return false;
00239   }
00240   if ( ! info._has_name ) {
00241 
00242     Log::instance()->error( "TapirOccurrences::load - Provider did not map the DarwinCore ScientificName concept\n" );
00243     return false;
00244   }
00245   if ( ! info._has_long ) {
00246 
00247     Log::instance()->error( "TapirOccurrences::load - Provider did not map the DarwinCore DecimalLongitude concept from the geospatial extension\n" );
00248     return false;
00249   }
00250   if ( ! info._has_lat ) {
00251 
00252     Log::instance()->error( "TapirOccurrences::load - Provider did not map the DarwinCore DecimalLatitude concept from the geospatial extension\n" );
00253     return false;
00254   }
00255   if ( ( ! info._accepts_om_template ) && ( ! info._accepts_any_model ) && ! info._accepts_om_model ) {
00256 
00257     Log::instance()->error( "TapirOccurrences::load - Provider must accept searches with the openModeller query template or searches with the openModeller output model or searches with any output model\n" );
00258     return false;
00259   }
00260 
00261   Log::instance()->info( "Finished reading capabilities\n" );
00262 
00263   _loaded = true;
00264 
00265   return true;
00266 }
00267 
00268 
00269 /**************************/
00270 /*** parse capabilities ***/
00271 bool
00272 TapirOccurrences::_parseCapabilities( const std::string *xml, CapabilitiesInfo *info )
00273 {
00274   XML_Parser parser = XML_ParserCreateNS( NULL, '/' );
00275 
00276   if ( ! parser ) {
00277 
00278     Log::instance()->error( "Unable to allocate memory for capabilities response parser" );
00279     return false;
00280   }
00281 
00282   info->_parser = parser;
00283 
00284   XML_SetElementHandler( parser, &TapirOccurrences::_startCapabilitiesElement, NULL );
00285 
00286   XML_SetUserData( parser, info );
00287 
00288   if ( ! XML_Parse( parser, xml->c_str(), xml->size(), 1 ) ) {
00289 
00290     XML_Error error_code =  XML_GetErrorCode( parser );
00291     std::ostringstream errormsg;
00292     errormsg << XML_ErrorString( error_code )
00293              << " at Line "
00294              << XML_GetCurrentLineNumber( parser )
00295              << " column "
00296              << XML_GetCurrentColumnNumber( parser )
00297              << std::ends;
00298 
00299     Log::instance()->error( "XML Parser fatal error for capabilities response: %s\n", errormsg.str().c_str() );
00300     XML_ParserFree( parser );
00301     return false;
00302   }
00303   
00304   XML_ParserFree( parser );
00305 
00306   _provider_limit = atoi( info->_max_records.c_str() );
00307 
00308   return true;
00309 }
00310 
00311 
00312 /**********************************/
00313 /*** start capabilities element ***/
00314 void 
00315 TapirOccurrences::_startCapabilitiesElement( void *data, const char *el, const char **attr )
00316 {
00317   CapabilitiesInfo& info = *( reinterpret_cast<CapabilitiesInfo*>( data ) );
00318 
00319   // response element
00320   if ( strncmp( el, "http://rs.tdwg.org/tapir/1.0/response", 37 ) == 0 ) {
00321 
00322     info._is_tapir = true;
00323   }
00324   // mappedConcept element
00325   else if ( strncmp( el, "http://rs.tdwg.org/tapir/1.0/mappedConcept", 42 ) == 0 ) {
00326 
00327     for ( int i = 0; attr[i]; i += 2 ) {
00328 
00329       // id attribute  (TODO: check "searchable" attribute too)
00330       if ( strncmp( attr[i], "id", 2 ) == 0 ) {
00331 
00332         if ( strncmp( attr[i+1], "http://rs.tdwg.org/dwc/dwcore/GlobalUniqueIdentifier", 52 ) == 0 ) {
00333           info._has_guid = true;
00334         }
00335         else if ( strncmp( attr[i+1], "http://rs.tdwg.org/dwc/dwcore/ScientificName", 44 ) == 0 ) {
00336 
00337           info._has_name = true;
00338         }
00339         else if ( strncmp( attr[i+1], "http://rs.tdwg.org/dwc/geospatial/DecimalLongitude", 50 ) == 0 ) {
00340 
00341           info._has_long = true;
00342         }
00343         else if ( strncmp( attr[i+1], "http://rs.tdwg.org/dwc/geospatial/DecimalLatitude/", 49 ) == 0 ) {
00344 
00345           info._has_lat = true;
00346         }
00347       }
00348     }
00349   }
00350   // template element
00351   else if ( strncmp( el, "http://rs.tdwg.org/tapir/1.0/template", 37 ) == 0 ) {
00352 
00353     for ( int i = 0; attr[i]; i += 2 ) {
00354 
00355       // location attribute
00356       if ( strncmp( attr[i], "location", 8 ) == 0 && 
00357            strncmp( attr[i+1], TP_TEMPLATE_LOCATION, TP_TEMPLATE_STRSIZE ) == 0 ) {
00358 
00359         info._accepts_om_template = true;
00360       }
00361     }
00362   }
00363   // outputModel element
00364   else if ( strncmp( el, "http://rs.tdwg.org/tapir/1.0/outputModel", 40 ) == 0 ) {
00365 
00366     for ( int i = 0; attr[i]; i += 2 ) {
00367 
00368       // location attribute
00369       if ( strncmp( attr[i], "location", 8 ) == 0 && 
00370            strncmp( attr[i+1], TP_OUTPUT_MODEL_LOCATION, TP_OUTPUT_MODEL_STRSIZE ) == 0 ) {
00371 
00372         info._accepts_om_model = true;
00373       }
00374     }
00375   }
00376   // anyOutputModels element
00377   else if ( strncmp( el, "http://rs.tdwg.org/tapir/1.0/anyOutputModels", 44 ) == 0 ) {
00378 
00379     info._accepts_any_model = true;
00380   }
00381   // maxElementRepetitions element
00382   else if ( strncmp( el, "http://rs.tdwg.org/tapir/1.0/maxElementRepetitions", 50 ) == 0 ) {
00383     XML_SetCharacterDataHandler( info._parser, &TapirOccurrences::_maxRecordsDataHandler );
00384 
00385     XML_SetElementHandler( info._parser, &TapirOccurrences::_startCapabilitiesElement, 
00386                                          &TapirOccurrences::_endCapabilitiesElement );
00387   }
00388 }
00389 
00390 
00391 /********************************/
00392 /*** max records data handler ***/
00393 void 
00394 TapirOccurrences::_maxRecordsDataHandler( void *data, const char *value, int len )
00395 {
00396   CapabilitiesInfo& info = *( reinterpret_cast<CapabilitiesInfo*>( data ) );
00397 
00398   info._max_records.append( value, len );
00399 }
00400 
00401 
00402 /********************************/
00403 /*** end capabilities element ***/
00404 void 
00405 TapirOccurrences::_endCapabilitiesElement( void *data, const char *el )
00406 {
00407   // maxElementRepetitions element
00408   if ( strncmp( el, "http://rs.tdwg.org/tapir/1.0/maxElementRepetitions", 50 ) == 0 ) {
00409 
00410     CapabilitiesInfo& info = *( reinterpret_cast<CapabilitiesInfo*>( data ) );
00411 
00412     XML_SetCharacterDataHandler( info._parser, NULL );
00413     XML_SetElementHandler( info._parser, &TapirOccurrences::_startCapabilitiesElement, NULL );
00414   }
00415 }
00416 
00417 
00418 /*********************/
00419 /*** get Presences ***/
00420 OccurrencesPtr
00421 TapirOccurrences::getPresences( const char *groupId )
00422 {
00423   // If group was not specified, return empty set
00424   if ( ! groupId ) {
00425 
00426     return new OccurrencesImpl( 1 );
00427   }
00428 
00429   LstOccurrences::iterator ocs = _presences.begin();
00430   LstOccurrences::iterator end = _presences.end();
00431 
00432   while ( ocs != end ) {
00433 
00434     OccurrencesPtr oc = *ocs;
00435 
00436     if ( ! strcasecmp( groupId, oc->label() ) ) {
00437 
00438       _presences.erase( ocs );
00439 
00440       return oc;
00441     }
00442 
00443     ++ocs;
00444   }
00445 
00446   // If not found, create new group and retrieve records from provider
00447 
00448   OccurrencesPtr occurrences( new OccurrencesImpl( groupId, _coord_system ) );
00449 
00450   TapirRecordData search_data;
00451 
00452   search_data._occurrences = occurrences;
00453   search_data._next = 0;
00454 
00455   int limit = _default_limit;
00456 
00457   if ( _provider_limit > 0 && _provider_limit < _default_limit ) {
00458 
00459     limit = _provider_limit;
00460   }
00461 
00462   while ( search_data._next >= 0 ) {
00463 
00464     Log::instance()->info( "Fetching records (start %d, limit %d)\n", search_data._next, limit );
00465 
00466     if ( ! _retrieveRecords( &search_data, limit ) ) {
00467 
00468       break;
00469     }
00470   }
00471 
00472   _presences.push_back( occurrences );
00473 
00474   return occurrences;
00475 }
00476 
00477 
00478 /************************/
00479 /*** retrieve Records ***/
00480 bool
00481 TapirOccurrences::_retrieveRecords( TapirRecordData *data, int limit )
00482 {
00483   // Prepare CURL handle
00484   CURL * curl_handle = curl_easy_init(); 
00485 
00486   if ( curl_handle == NULL ) {
00487 
00488     Log::instance()->error( "TapirOccurrences::_retrieveRecords - Could not initialize curl handle\n" );
00489     return false;
00490   }
00491 
00492   // Prepare TAPIR search request
00493   std::string source( _source );
00494 
00495   std::ostringstream search_url;
00496 
00497   search_url << source.c_str();
00498 
00499   if ( source.find( "?" ) != string::npos ) {
00500 
00501     search_url <<  "&";
00502   }
00503   else {
00504 
00505     search_url << "?";
00506   } 
00507 
00508   search_url << "op=s&s=" << data->_next;
00509 
00510   search_url << "&l=" << limit;
00511 
00512 // curl_easy_escape was included in libcurl version 7.15.4
00513 #if LIBCURL_VERSION_NUM >= 0x070f04
00514   search_url << "&sciname=" << curl_easy_escape( curl_handle, data->_occurrences->label(), 0 );
00515   search_url << "&t=" << curl_easy_escape( curl_handle, TP_TEMPLATE_LOCATION, 0 );
00516 #else
00517   search_url << "&sciname=" << curl_escape( data->_occurrences->label(), 0 );
00518   search_url << "&t=" << curl_escape( TP_TEMPLATE_LOCATION, 0 );
00519 #endif
00520 
00521   // After using next to make the URL, set it to -1 to stop the process in case 
00522   // the response does not return the "next" attribute
00523   data->_next = -1;
00524 
00525   // Set CURL options
00526   if ( curl_easy_setopt( curl_handle, CURLOPT_URL, search_url.str().c_str() ) != CURLE_OK ) {
00527 
00528     Log::instance()->error( "TapirOccurrences::_retrieveRecords - Failed to set CURLOPT_URL\n" );
00529     curl_easy_cleanup( curl_handle );
00530     return false;
00531   }
00532 
00533   if ( curl_easy_setopt( curl_handle, CURLOPT_HEADERFUNCTION, &TapirOccurrences::_curl_header_writer ) != CURLE_OK ) {
00534 
00535     Log::instance()->error( "TapirOccurrences::_retrieveRecords - Failed to set CURLOPT_HEADERFUNCTION\n" );
00536     curl_easy_cleanup( curl_handle );
00537     return false;
00538   }
00539 
00540   std::string header;
00541 
00542   if ( curl_easy_setopt( curl_handle, CURLOPT_WRITEHEADER, &header ) != CURLE_OK ) {
00543 
00544     Log::instance()->error( "TapirOccurrences::_retrieveRecords - Failed to set CURLOPT_WRITEHEADER\n" );
00545     curl_easy_cleanup( curl_handle );
00546     return false;
00547   }
00548 
00549   if ( curl_easy_setopt( curl_handle, CURLOPT_WRITEFUNCTION, &TapirOccurrences::_curl_body_writer ) != CURLE_OK ) {
00550 
00551     Log::instance()->error( "TapirOccurrences::_retrieveRecords - Failed to set CURLOPT_WRITEFUNCTION\n" );
00552     curl_easy_cleanup( curl_handle );
00553     return false;
00554   }
00555 
00556   std::string search_response;
00557 
00558   if ( curl_easy_setopt( curl_handle, CURLOPT_WRITEDATA, &search_response ) != CURLE_OK ) {
00559 
00560     Log::instance()->error( "TapirOccurrences::_retrieveRecords - Failed to set CURLOPT_WRITEDATA\n" );
00561     curl_easy_cleanup( curl_handle );
00562     return false;
00563   }
00564 
00565   Log::instance()->info( "Searching records using %s\n", search_url.str().c_str() );
00566 
00567   // Send search request
00568   CURLcode curl_code = curl_easy_perform( curl_handle );
00569 
00570   if ( curl_code != CURLE_OK )
00571   {
00572     const char * error_msg = curl_easy_strerror( curl_code );
00573 
00574     Log::instance()->error( "TapirOccurrences::_retrieveRecords - Could not search records from the specified TAPIR endpoint (CURL error: %s)\n", error_msg );
00575 
00576     curl_easy_cleanup( curl_handle );
00577     return false;
00578   }
00579 
00580   curl_easy_cleanup( curl_handle );
00581 
00582   // Check content type returned
00583   if ( header.find( "Content-Type: text/xml" ) == string::npos )
00584   {
00585     Log::instance()->error( "TapirOccurrences::_retrieveRecords - URL does not seem to be a TAPIR endpoint (HTTP Content-Type header is not text/xml)\n" );
00586     return false;
00587   }
00588 
00589   // Parse search response
00590   XML_Parser parser = XML_ParserCreateNS( NULL, '/' );
00591 
00592   if ( ! parser ) {
00593 
00594     Log::instance()->error( "Unable to allocate memory for search response parser" );
00595     return false;
00596   }
00597 
00598   XML_SetElementHandler( parser, &TapirOccurrences::_startSearchElement, NULL );
00599 
00600   XML_SetUserData( parser, data );
00601 
00602   if ( ! XML_Parse( parser, search_response.c_str(), search_response.size(), 1 ) ) {
00603 
00604     XML_Error error_code =  XML_GetErrorCode( parser );
00605     std::ostringstream errormsg;
00606     errormsg << XML_ErrorString( error_code )
00607              << " at Line "
00608              << XML_GetCurrentLineNumber( parser )
00609              << " column "
00610              << XML_GetCurrentColumnNumber( parser )
00611              << std::ends;
00612 
00613     Log::instance()->error( "XML Parser fatal error for search response: %s\n", errormsg.str().c_str() );
00614     XML_ParserFree( parser );
00615     return false;
00616   }
00617   
00618   XML_ParserFree( parser );
00619 
00620   return true;
00621 }
00622 
00623 
00624 /****************************/
00625 /*** start search element ***/
00626 void 
00627 TapirOccurrences::_startSearchElement( void *data, const char *el, const char **attr )
00628 {
00629   TapirRecordData& search_data = *( reinterpret_cast<TapirRecordData*>( data ) );
00630 
00631   // occ element
00632   if ( strlen( el ) == 49 && 
00633        strncmp( el, "http://openmodeller.cria.org.br/xml/tapir/1.0/occ", 49 ) == 0 ) {
00634 
00635     std::string guid("");
00636 
00637     Coord lg = 0;
00638     Coord lt = 0;
00639 
00640     for ( int i = 0; attr[i]; i += 2 ) {
00641 
00642       // guid attribute
00643       if ( strncmp( attr[i], "guid", 4 ) == 0 ) {
00644 
00645         guid = attr[i+1];
00646       }
00647       // long attribute
00648       else if ( strncmp( attr[i], "long", 4 ) == 0 ) {
00649 
00650         lg = Coord( atof( attr[i+1] ) );
00651       }
00652       // lat attribute
00653       else if ( strncmp( attr[i], "lat", 3 ) == 0 ) {
00654 
00655         lt = Coord( atof( attr[i+1] ) );
00656       }
00657       // TODO: get datum and convert coordinates when necessary
00658     }
00659 
00660     search_data._occurrences->createOccurrence( guid.c_str(), lg, lt, 0.0, 1, 0, 0 );
00661   }
00662   // summary element
00663   if ( strncmp( el, "http://rs.tdwg.org/tapir/1.0/summary", 36 ) == 0 ) {
00664 
00665     for ( int i = 0; attr[i]; i += 2 ) {
00666 
00667       // next attribute
00668       if ( strncmp( attr[i], "next", 4 ) == 0 ) {
00669 
00670         search_data._next = atoi( attr[i+1] );
00671       }
00672     }
00673   }
00674 }