openModeller  Version 1.4.0
GbifOccurrences.cpp
Go to the documentation of this file.
00001 
00027 #include <openmodeller/occ_io/GbifOccurrences.hh>
00028 
00029 #include <openmodeller/Occurrences.hh>
00030 
00031 #include <openmodeller/Exceptions.hh>
00032 
00033 #include <stdlib.h>
00034 
00035 //c style include needed for strcmp etc
00036 #include <string.h>
00037 using std::string;
00038 
00039 #include <sstream>
00040 using std::ostringstream;
00041 
00042 #include <curl/curl.h>
00043 
00044 #include <expat.h>
00045 
00046 /*****************************************/
00047 /*** create OccurrencesReader callback ***/
00048 OccurrencesReader * 
00049 GbifOccurrences::CreateOccurrencesReaderCallback( const char *source, const char *coordSystem )
00050 {
00051   return new GbifOccurrences( source, coordSystem );
00052 }
00053 
00054 /***************************/
00055 /*** _curl header writer ***/
00056 size_t 
00057 GbifOccurrences::_curl_header_writer( void *ptr, size_t size, size_t nmemb, void *stream )
00058 {
00059    if ( stream == NULL ) {
00060 
00061      return 0;
00062    }
00063 
00064   // *stream is actually a string object
00065   std::string& str = *( reinterpret_cast<std::string*>( stream ) );
00066 
00067   str.append( reinterpret_cast<const char*>(ptr), size*nmemb );
00068 
00069   return size*nmemb;
00070 }
00071 
00072 /*************************/
00073 /*** _curl body writer ***/
00074 size_t 
00075 GbifOccurrences::_curl_body_writer( void *ptr, size_t size, size_t nmemb, void *stream )
00076 {
00077    if ( stream == NULL ) {
00078 
00079      return 0;
00080    }
00081 
00082   // *stream is actually a string object
00083   std::string& str = *( reinterpret_cast<std::string*>( stream ) );
00084 
00085   str.append( reinterpret_cast<const char*>(ptr), size*nmemb );
00086 
00087   return size*nmemb;
00088 }
00089 
00090 /*******************/
00091 /*** Constructor ***/
00092 GbifOccurrences::GbifOccurrences( const char *source, const char *coordSystem )
00093 {
00094   if ( curl_global_init( CURL_GLOBAL_ALL ) != CURLE_OK ) {
00095 
00096     throw OccurrencesReaderException( "Could not initialize libcurl" );
00097   }
00098 
00099   _loaded = false;
00100 
00101   _source = (char *) source; // endpoint
00102 
00103   _coord_system = (char *) coordSystem;
00104 
00105   _default_limit = 100;
00106 }
00107 
00108 
00109 /******************/
00110 /*** Destructor ***/
00111 GbifOccurrences::~GbifOccurrences()
00112 {
00113   curl_global_cleanup();
00114 }
00115 
00116 
00117 /************/
00118 /*** load ***/
00119 bool
00120 GbifOccurrences::load()
00121 {
00122   if ( _loaded ) {
00123 
00124     return true;
00125   }
00126 
00127   Log::instance()->info( "Checking endpoint using GBIF driver\n" );
00128 
00129   Log::instance()->info( "CURL version is %s\n", curl_version() );
00130 
00131   // Prepare CURL handle
00132   CURL * curl_handle = curl_easy_init(); 
00133 
00134   if ( curl_handle == NULL ) {
00135 
00136     Log::instance()->error( "GbifOccurrences::load - Could not initialize curl handle\n" );
00137     return false;
00138   }
00139 
00140   // Prepare request with no parameters just to check if it's the GBIF service
00141 
00142 
00143   // Set CURL options
00144   if ( curl_easy_setopt( curl_handle, CURLOPT_URL, _source ) != CURLE_OK ) {
00145 
00146     Log::instance()->error( "GbifOccurrences::load - Failed to set CURLOPT_URL\n" );
00147     curl_easy_cleanup( curl_handle );
00148     return false;
00149   }
00150 
00151   if ( curl_easy_setopt( curl_handle, CURLOPT_HEADERFUNCTION, &GbifOccurrences::_curl_header_writer ) != CURLE_OK ) {
00152 
00153     Log::instance()->error( "GbifOccurrences::load - Failed to set CURLOPT_HEADERFUNCTION\n" );
00154     curl_easy_cleanup( curl_handle );
00155     return false;
00156   }
00157 
00158   std::string header;
00159 
00160   if ( curl_easy_setopt( curl_handle, CURLOPT_WRITEHEADER, &header ) != CURLE_OK ) {
00161 
00162     Log::instance()->error( "GbifOccurrences::load - Failed to set CURLOPT_WRITEHEADER\n" );
00163     curl_easy_cleanup( curl_handle );
00164     return false;
00165   }
00166 
00167   if ( curl_easy_setopt( curl_handle, CURLOPT_WRITEFUNCTION, &GbifOccurrences::_curl_body_writer ) != CURLE_OK ) {
00168 
00169     Log::instance()->error( "GbifOccurrences::load - Failed to set CURLOPT_WRITEFUNCTION\n" );
00170     curl_easy_cleanup( curl_handle );
00171     return false;
00172   }
00173 
00174   std::string response;
00175 
00176   if ( curl_easy_setopt( curl_handle, CURLOPT_WRITEDATA, &response ) != CURLE_OK ) {
00177 
00178     Log::instance()->error( "GbifOccurrences::load - Failed to set CURLOPT_WRITEDATA\n" );
00179     curl_easy_cleanup( curl_handle );
00180     return false;
00181   }
00182 
00183   Log::instance()->info( "Getting service info using %s\n", _source );
00184 
00185   // Send request
00186   CURLcode curl_code = curl_easy_perform( curl_handle );
00187 
00188   if ( curl_code != CURLE_OK )
00189   {
00190     Log::instance()->error( "GbifOccurrences::load - Could not get service info from the specified endpoint (CURL code error: %d)\n", curl_code );
00191     curl_easy_cleanup( curl_handle );
00192     return false;
00193   }
00194 
00195   curl_easy_cleanup( curl_handle );
00196 
00197   // Check content type returned
00198   if ( header.find( "Content-Type: text/xml" ) == string::npos )
00199   {
00200     Log::instance()->error( "GbifOccurrences::load - URL does not seem to be a GBIF REST WS endpoint (HTTP Content-Type header is not text/xml)\n" );
00201 
00202     Log::instance()->info( "HEADER: %s\n", header.c_str() );
00203 
00204     return false;
00205   }
00206 
00207   // Data structure to store relevant service information
00208   ServiceInfo info;
00209 
00210   // Parse response
00211   if ( ! _parseServiceInfo( &response, &info ) ) {
00212 
00213     return false;
00214   }
00215 
00216   // Check response
00217   if ( ! info._is_gbif ) {
00218 
00219     Log::instance()->error( "GbifOccurrences::load - URL does not seem to be a GBIF WS REST endpoint (root element is not gbifResponse)\n" );
00220     return false;
00221   }
00222 
00223   Log::instance()->info( "Finished checking service info\n" );
00224 
00225   _loaded = true;
00226 
00227   return true;
00228 }
00229 
00230 
00231 /**************************/
00232 /*** parse service info ***/
00233 bool
00234 GbifOccurrences::_parseServiceInfo( const std::string *xml, ServiceInfo *info )
00235 {
00236   XML_Parser parser = XML_ParserCreateNS( NULL, '/' );
00237 
00238   if ( ! parser ) {
00239 
00240     Log::instance()->error( "Unable to allocate memory for capabilities response parser" );
00241     return false;
00242   }
00243 
00244   info->_parser = parser;
00245 
00246   XML_SetElementHandler( parser, &GbifOccurrences::_startServiceInfoElement, NULL );
00247 
00248   XML_SetUserData( parser, info );
00249 
00250   if ( ! XML_Parse( parser, xml->c_str(), xml->size(), 1 ) ) {
00251 
00252     XML_Error error_code =  XML_GetErrorCode( parser );
00253     std::ostringstream errormsg;
00254     errormsg << XML_ErrorString( error_code )
00255              << " at Line "
00256              << XML_GetCurrentLineNumber( parser )
00257              << " column "
00258              << XML_GetCurrentColumnNumber( parser )
00259              << std::ends;
00260 
00261     Log::instance()->error( "XML Parser fatal error for capabilities response: %s\n", errormsg.str().c_str() );
00262     XML_ParserFree( parser );
00263     return false;
00264   }
00265   
00266   XML_ParserFree( parser );
00267 
00268   _provider_limit = atoi( info->_max_records.c_str() );
00269 
00270   return true;
00271 }
00272 
00273 
00274 /**********************************/
00275 /*** start service info element ***/
00276 void 
00277 GbifOccurrences::_startServiceInfoElement( void *data, const char *el, const char **attr )
00278 {
00279   ServiceInfo& info = *( reinterpret_cast<ServiceInfo*>( data ) );
00280 
00281   // gbifResponse element
00282   if ( strncmp( el, "http://portal.gbif.org/ws/response/gbif/gbifResponse", 52 ) == 0 ) {
00283 
00284     info._is_gbif = true;
00285   }
00286   // parameter element
00287   else if ( strncmp( el, "http://portal.gbif.org/ws/response/gbif/parameter", 49 ) == 0 ) {
00288 
00289     bool is_maxresults = false;
00290 
00291     std::string value("");
00292 
00293     for ( int i = 0; attr[i]; i += 2 ) {
00294 
00295       // name attribute
00296       if ( strncmp( attr[i], "name", 4 ) == 0 ) {
00297 
00298         if ( strncmp( attr[i+1], "maxresults", 10 ) == 0 ) {
00299           is_maxresults = true;
00300         }
00301       }
00302       else if ( strncmp( attr[i], "value", 5 ) == 0 ) {
00303 
00304         value = attr[i+1];
00305       }
00306 
00307       if ( is_maxresults ) {
00308 
00309         info._max_records = value;
00310 
00311         // No need to keep parsing
00312         XML_SetElementHandler( info._parser, NULL, NULL );
00313       }
00314     }
00315   }
00316 }
00317 
00318 
00319 /*********************/
00320 /*** get Presences ***/
00321 OccurrencesPtr
00322 GbifOccurrences::getPresences( const char *groupId )
00323 {
00324   // If group was not specified, return empty set
00325   if ( ! groupId ) {
00326 
00327     return new OccurrencesImpl( 1 );
00328   }
00329 
00330   LstOccurrences::iterator ocs = _presences.begin();
00331   LstOccurrences::iterator end = _presences.end();
00332 
00333   while ( ocs != end ) {
00334 
00335     OccurrencesPtr oc = *ocs;
00336 
00337     if ( ! strcasecmp( groupId, oc->label() ) ) {
00338 
00339       _presences.erase( ocs );
00340 
00341       return oc;
00342     }
00343 
00344     ++ocs;
00345   }
00346 
00347   // If not found, create new group and retrieve records from provider
00348 
00349   OccurrencesPtr occurrences( new OccurrencesImpl( groupId, _coord_system ) );
00350 
00351   GbifRecordData search_data;
00352 
00353   search_data._occurrences = occurrences;
00354   search_data._next = 0;
00355 
00356   int limit = _default_limit;
00357 
00358   if ( _provider_limit > 0 && _provider_limit < _default_limit ) {
00359 
00360     limit = _provider_limit;
00361   }
00362 
00363   while ( search_data._next >= 0 ) {
00364 
00365     Log::instance()->info( "Fetching records (start %d, limit %d)\n", search_data._next, limit );
00366 
00367     if ( ! _retrieveRecords( &search_data, limit ) ) {
00368 
00369       break;
00370     }
00371   }
00372 
00373   _presences.push_back( occurrences );
00374 
00375   return occurrences;
00376 }
00377 
00378 
00379 /************************/
00380 /*** retrieve Records ***/
00381 bool
00382 GbifOccurrences::_retrieveRecords( GbifRecordData *data, int limit )
00383 {
00384   // Prepare CURL handle
00385   CURL * curl_handle = curl_easy_init(); 
00386 
00387   if ( curl_handle == NULL ) {
00388 
00389     Log::instance()->error( "GbifOccurrences::_retrieveRecords - Could not initialize curl handle\n" );
00390     return false;
00391   }
00392 
00393   // Prepare search request
00394   std::string source( _source );
00395 
00396   std::ostringstream search_url;
00397 
00398   search_url << source.c_str();
00399 
00400   if ( source.find( "?" ) != string::npos ) {
00401 
00402     search_url <<  "&";
00403   }
00404   else {
00405 
00406     search_url << "?";
00407   } 
00408 
00409   search_url << "startindex=" << data->_next;
00410 
00411   search_url << "&maxresults=" << limit;
00412 
00413 // curl_easy_escape was included in libcurl version 7.15.4
00414 #if LIBCURL_VERSION_NUM >= 0x070f04
00415   search_url << "&scientificname=" << curl_easy_escape( curl_handle, data->_occurrences->label(), 0 );
00416 #else
00417   search_url << "&scientificname=" << curl_escape( data->_occurrences->label(), 0 );
00418 #endif
00419 
00420   search_url << "&format=brief&coordinatestatus=true&coordinateissues=false";
00421 
00422   // After using next to make the URL, set it to -1 to stop the process in case 
00423   // the response does not return the "next" attribute
00424   data->_next = -1;
00425 
00426   // Set CURL options
00427   if ( curl_easy_setopt( curl_handle, CURLOPT_URL, search_url.str().c_str() ) != CURLE_OK ) {
00428 
00429     Log::instance()->error( "GbifOccurrences::_retrieveRecords - Failed to set CURLOPT_URL\n" );
00430     curl_easy_cleanup( curl_handle );
00431     return false;
00432   }
00433 
00434   if ( curl_easy_setopt( curl_handle, CURLOPT_HEADERFUNCTION, &GbifOccurrences::_curl_header_writer ) != CURLE_OK ) {
00435 
00436     Log::instance()->error( "GbifOccurrences::_retrieveRecords - Failed to set CURLOPT_HEADERFUNCTION\n" );
00437     curl_easy_cleanup( curl_handle );
00438     return false;
00439   }
00440 
00441   std::string header;
00442 
00443   if ( curl_easy_setopt( curl_handle, CURLOPT_WRITEHEADER, &header ) != CURLE_OK ) {
00444 
00445     Log::instance()->error( "GbifOccurrences::_retrieveRecords - Failed to set CURLOPT_WRITEHEADER\n" );
00446     curl_easy_cleanup( curl_handle );
00447     return false;
00448   }
00449 
00450   if ( curl_easy_setopt( curl_handle, CURLOPT_WRITEFUNCTION, &GbifOccurrences::_curl_body_writer ) != CURLE_OK ) {
00451 
00452     Log::instance()->error( "GbifOccurrences::_retrieveRecords - Failed to set CURLOPT_WRITEFUNCTION\n" );
00453     curl_easy_cleanup( curl_handle );
00454     return false;
00455   }
00456 
00457   std::string search_response;
00458 
00459   if ( curl_easy_setopt( curl_handle, CURLOPT_WRITEDATA, &search_response ) != CURLE_OK ) {
00460 
00461     Log::instance()->error( "GbifOccurrences::_retrieveRecords - Failed to set CURLOPT_WRITEDATA\n" );
00462     curl_easy_cleanup( curl_handle );
00463     return false;
00464   }
00465 
00466   Log::instance()->info( "Searching records using %s\n", search_url.str().c_str() );
00467 
00468   // Send search request
00469   CURLcode curl_code = curl_easy_perform( curl_handle );
00470 
00471   if ( curl_code != CURLE_OK )
00472   {
00473     const char * error_msg = curl_easy_strerror( curl_code );
00474 
00475     Log::instance()->error( "GbifOccurrences::_retrieveRecords - Could not search records from the specified GBIF endpoint (CURL error: %s)\n", error_msg );
00476 
00477     curl_easy_cleanup( curl_handle );
00478     return false;
00479   }
00480 
00481   curl_easy_cleanup( curl_handle );
00482 
00483   // Check content type returned
00484   if ( header.find( "Content-Type: text/xml" ) == string::npos )
00485   {
00486     Log::instance()->error( "GbifOccurrences::_retrieveRecords - URL does not seem to be a GBIF WS REST endpoint (HTTP Content-Type header is not text/xml)\n" );
00487     return false;
00488   }
00489 
00490   // Parse search response
00491   XML_Parser parser = XML_ParserCreateNS( NULL, '/' );
00492 
00493   if ( ! parser ) {
00494 
00495     Log::instance()->error( "Unable to allocate memory for search response parser" );
00496     return false;
00497   }
00498 
00499   data->_parser = parser;
00500 
00501   XML_SetElementHandler( parser, &GbifOccurrences::_startSearchElement, &GbifOccurrences::_endSearchElement );
00502 
00503   XML_SetUserData( parser, data );
00504 
00505   if ( ! XML_Parse( parser, search_response.c_str(), search_response.size(), 1 ) ) {
00506 
00507     XML_Error error_code =  XML_GetErrorCode( parser );
00508     std::ostringstream errormsg;
00509     errormsg << XML_ErrorString( error_code )
00510              << " at Line "
00511              << XML_GetCurrentLineNumber( parser )
00512              << " column "
00513              << XML_GetCurrentColumnNumber( parser )
00514              << std::ends;
00515 
00516     Log::instance()->error( "XML Parser fatal error for search response: %s\n", errormsg.str().c_str() );
00517     XML_ParserFree( parser );
00518     return false;
00519   }
00520   
00521   XML_ParserFree( parser );
00522 
00523   return true;
00524 }
00525 
00526 
00527 /****************************/
00528 /*** start search element ***/
00529 void 
00530 GbifOccurrences::_startSearchElement( void *data, const char *el, const char **attr )
00531 {
00532   GbifRecordData& search_data = *( reinterpret_cast<GbifRecordData*>( data ) );
00533 
00534   // summary element
00535   if ( strncmp( el, "http://portal.gbif.org/ws/response/gbif/summary", 47 ) == 0 ) {
00536 
00537     for ( int i = 0; attr[i]; i += 2 ) {
00538 
00539       // next attribute
00540       if ( strncmp( attr[i], "next", 4 ) == 0 ) {
00541 
00542         search_data._next = atoi( attr[i+1] );
00543       }
00544     }
00545   }
00546   // TaxonOccurrence element
00547   else if ( strncmp( el, "http://rs.tdwg.org/ontology/voc/TaxonOccurrence#/TaxonOccurrence", 64 ) == 0 ) {
00548 
00549     for ( int i = 0; attr[i]; i += 2 ) {
00550 
00551       // rdf:about attribute
00552       if ( strncmp( attr[i], "http://www.w3.org/1999/02/22-rdf-syntax-ns#/about", 49 ) == 0 ) {
00553 
00554         search_data._last_guid = attr[i+1];
00555       }
00556     }
00557   }
00558   // decimalLatitude element
00559   else if ( strncmp( el, "http://rs.tdwg.org/ontology/voc/TaxonOccurrence#/decimalLatitude", 64 ) == 0 ) {
00560 
00561     XML_SetCharacterDataHandler( search_data._parser, &GbifOccurrences::_ltDataHandler );
00562   }
00563   // decimalLongitude element
00564   else if ( strncmp( el, "http://rs.tdwg.org/ontology/voc/TaxonOccurrence#/decimalLongitude", 65 ) == 0 ) {
00565 
00566     XML_SetCharacterDataHandler( search_data._parser, &GbifOccurrences::_lgDataHandler );
00567   }
00568 }
00569 
00570 
00571 /**************************/
00572 /*** end search element ***/
00573 void 
00574 GbifOccurrences::_endSearchElement( void *data, const char *el )
00575 {
00576   // TaxonOccurrence element
00577   if ( strncmp( el, "http://rs.tdwg.org/ontology/voc/TaxonOccurrence#/TaxonOccurrence", 64 ) == 0 ) {
00578 
00579     GbifRecordData& search_data = *( reinterpret_cast<GbifRecordData*>( data ) );
00580 
00581     search_data._occurrences->createOccurrence( search_data._last_guid.c_str(), search_data._last_lg, search_data._last_lt, 0.0, 1, 0, 0 );
00582   }
00583 }
00584 
00585 
00586 /***********************/
00587 /*** lt data handler ***/
00588 void 
00589 GbifOccurrences::_ltDataHandler( void *data, const char *value, int len )
00590 {
00591   GbifRecordData& search_data = *( reinterpret_cast<GbifRecordData*>( data ) );
00592 
00593   std::string lt("");
00594   lt.append( value, len );
00595 
00596   search_data._last_lt = Coord( atof( lt.c_str() ) );
00597 
00598   XML_SetCharacterDataHandler( search_data._parser, NULL );
00599 }
00600 
00601 
00602 /***********************/
00603 /*** lg data handler ***/
00604 void 
00605 GbifOccurrences::_lgDataHandler( void *data, const char *value, int len )
00606 {
00607   GbifRecordData& search_data = *( reinterpret_cast<GbifRecordData*>( data ) );
00608 
00609   std::string lg("");
00610   lg.append( value, len );
00611 
00612   search_data._last_lg = Coord( atof( lg.c_str() ) );
00613 
00614   XML_SetCharacterDataHandler( search_data._parser, NULL );
00615 }