00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include "omgscrapergbif.h"
00022 #include "omgwebpagefetcher.h"
00023 #include "omgui.h"
00024
00025
00026 #include <QTimer>
00027 #include <QRegExp>
00028 #include <QDebug>
00029 #include <QFileInfo>
00030 #include <QFile>
00031 #include <QTextStream>
00032
00033 #include <QtPlugin>
00034 OmgScraperGbif::OmgScraperGbif(QObject * parent) :
00035 QObject(parent),
00036 OmgScraperPluginInterface()
00037 {
00038 qDebug("OmgScraperGbif constructor called...");
00039 }
00040
00041
00042 OmgScraperGbif::~OmgScraperGbif()
00043 {
00044 }
00045
00046 const QString OmgScraperGbif::getName()
00047 {
00048 return QString("GBIF Plugin");
00049 }
00050 const QString OmgScraperGbif::getLicense()
00051 {
00052 QFile myQFile( ":/gbif_terms.txt" );
00053 QString myString;
00054 if ( myQFile.open( QIODevice::ReadOnly ) )
00055 {
00056
00057 QTextStream myStream( &myQFile );
00058 myString = myStream.readAll();
00059 myQFile.close();
00060 }
00061 else
00062 {
00063 myString="Terms and conditions document for GBIF could not be retrieved.";
00064 }
00065 return myString;
00066 }
00067
00068 bool OmgScraperGbif::search(QString theTaxonName, QString theFileName)
00069 {
00070
00071 mTaxonName = theTaxonName;
00072 mFileName = theFileName;
00073
00074 if (mTaxonName.isEmpty())
00075 {
00076 mMessenger.emitError("Taxon name is empty!");
00077 return false;
00078 }
00079
00080 if (mFileName.isEmpty())
00081 {
00082 mMessenger.emitError("File name is empty!");
00083 return false;
00084 }
00085
00086 mTaxonName = theTaxonName;
00087 mFileName = theFileName;
00088 QString mySearchString=theTaxonName.simplified();
00089 mySearchString=mySearchString.replace(" ","+");
00090 QString myUrlString = "http://www.secretariat.gbif.net/portal/ecat_search.jsp?";
00091 myUrlString += "termsAccepted=true&search=";
00092 myUrlString += mySearchString;
00093 myUrlString += "&countryKey=0&searchType=1&searchMode=1";
00094
00095 qDebug("\n\n ------------------------ part 1 of gbif search ------------------- \n\n");
00096 qDebug ("URL for Search string = " + myUrlString.toLocal8Bit());
00097 OmgWebPageFetcher myWebPageFetcher;
00098 connect(&myWebPageFetcher, SIGNAL(statusChanged(QString)),
00099 this, SLOT(setStatus(QString)));
00100 QString myFirstPage = myWebPageFetcher.getPage(myUrlString);
00101 QString mySecondUrl = taxonIdRequestDone(myFirstPage);
00102 if (!mySecondUrl.isEmpty())
00103 {
00104
00105 OmgWebPageFetcher myWebPageFetcher2;
00106 connect(&myWebPageFetcher2, SIGNAL(statusChanged(QString)),
00107 this, SLOT(setStatus(QString)));
00108 qDebug("\n\n ------------------------ part 2 of gbif search ------------------- \n\n");
00109 qDebug("URL for Search Result string = " + mySecondUrl.toLocal8Bit());
00110 QString mySecondPage = myWebPageFetcher2.getPage(mySecondUrl);
00111 localitiesRequestDone(mySecondPage);
00112 }
00113 return true;
00114 }
00115
00116
00117
00118
00119
00120
00121 QString OmgScraperGbif::taxonIdRequestDone(QString theString)
00122 {
00123 if (theString.isEmpty())
00124 {
00125 mMessenger.emitMessage(tr("Error getting species id"));
00126 mMessenger.emitFileNotWritten(mTaxonName);
00127 return false;
00128 }
00129 QRegExp myPhraseRegex( "taxonKey=[0-9]*&" );
00130 QRegExp myIdRegex( "[0-9]*" );
00131
00132
00133
00134
00135 if (!myPhraseRegex.isValid())
00136 {
00137 mMessenger.emitError(tr("Gbif scraper:: The taxon id regex is invalid. Fix it and try again" ));
00138 return false;
00139 }
00140 else
00141 {
00142
00143
00144
00145
00146 myPhraseRegex.setMinimal(true);
00147 }
00148
00149
00150
00151 if (!myIdRegex.isValid())
00152 {
00153 mMessenger.emitError(tr("Gbif scraper:: The taxon id 'only' regex is invalid. Fix it and try again" ));
00154 return false;
00155 }
00156 else
00157 {
00158
00159
00160
00161
00162 myIdRegex.setMinimal(true);
00163 }
00164
00165
00166
00167
00168
00169
00170
00171 int myPosInt = myPhraseRegex.indexIn( theString,0 );
00172
00173 if ( myPosInt >= 0 )
00174 {
00175 QString myPhraseLineString =
00176 theString.mid(myPosInt,myPhraseRegex.matchedLength());
00177 QString myIdString = myPhraseLineString;
00178 int myIdPosInt = myIdRegex.indexIn( myPhraseLineString,0);
00179 if ( myIdPosInt >= 0 )
00180 {
00181
00182
00183
00184 myIdString = myIdString.replace("taxonKey=","");
00185 myIdString = myIdString.replace("&","");
00186 qDebug() << "Taxon id found! : " << myIdString ;
00187
00188
00189
00190 QString myUrl = "http://www.secretariat.gbif.net/portal/download_issue.jsp?";
00191 myUrl += "termsAccepted=true&";
00192 myUrl += "taxonKey=" + myIdString + "&";
00193 myUrl += "countryKey=0&";
00194 myUrl += "resourceKey=0&";
00195 myUrl += "georeferencedOnly=false&";
00196 myUrl += "concepts=DateLastModified&";
00197 myUrl += "concepts=InstitutionCode&";
00198 myUrl += "concepts=CollectionCode&";
00199 myUrl += "concepts=CatalogNumber&";
00200 myUrl += "concepts=ScientificName&";
00201 myUrl += "concepts=Genus&";
00202 myUrl += "concepts=Species&";
00203 myUrl += "concepts=Subspecies&";
00204 myUrl += "concepts=YearCollected&";
00205 myUrl += "concepts=Country&";
00206 myUrl += "concepts=StateProvince&";
00207 myUrl += "concepts=County&";
00208 myUrl += "concepts=Locality&";
00209 myUrl += "concepts=Longitude&";
00210 myUrl += "concepts=Latitude&";
00211 myUrl += "concepts=Notes&indexOnly=true&";
00212 myUrl += "format=0&";
00213 myUrl += "download=Accept+terms";
00214
00215 return myUrl;
00216 }
00217 else
00218 {
00219 mMessenger.emitFileNotWritten(mTaxonName);
00220 return QString();
00221 }
00222 }
00223
00224 qDebug ("No id found for name, cancelling search for this taxon!");
00225
00226 mMessenger.emitFileNotWritten(mTaxonName);
00227 return QString();
00228 }
00229
00230 bool OmgScraperGbif::localitiesRequestDone(QString theString)
00231 {
00232
00233 if (theString.isEmpty())
00234 {
00235 qDebug ("Localities request returned empty");
00236 mMessenger.emitError("Localities request returned empty");
00237 return false;
00238 }
00239
00240
00241
00242
00243
00244
00245 QRegExp myRecordQRegExp(".*\n" );
00246
00247
00248
00249
00250
00251
00252 if (!myRecordQRegExp.isValid())
00253 {
00254 mMessenger.emitError(tr( "GBIF scraper:: the record regex is invalid. Fix it and try again" ));
00255 }
00256 else
00257 {
00258
00259
00260
00261
00262 myRecordQRegExp.setMinimal(true);
00263 }
00264
00265
00266
00267 int myPosInt = 0;
00268 int myCountInt = 0;
00269 bool myFirstRowFlag=true;
00270 while ( myPosInt >= 0 )
00271 {
00272 myPosInt = myRecordQRegExp.indexIn( theString,myPosInt );
00273
00274 QString myGenusString = "";
00275 QString mySpeciesString = "";
00276 QString mySubSpeciesString = "";
00277 QString myLatitudeString = "";
00278 QString myLongitudeString = "";
00279 QString myInstitutionString = "";
00280 QString myCollectionCodeString = "";
00281 QString myAccessionCodeString = "";
00282
00283 if ( myPosInt >= 0 )
00284 {
00285
00286 QString myRecordQString = theString.mid(myPosInt,myRecordQRegExp.matchedLength());
00287
00288 myPosInt += myRecordQRegExp.matchedLength();
00289
00290
00291 if (myFirstRowFlag==true)
00292 {
00293 myFirstRowFlag=false;
00294 continue;
00295 }
00296
00297 QStringList myStringArray = myRecordQString.split("\t");
00298
00299
00300
00301
00302
00303
00304
00305 if (myStringArray.size() < 15)
00306 {
00307 continue;
00308 }
00309 else
00310 {
00311 myGenusString = myStringArray[5];
00312 mySpeciesString = myStringArray[6];
00313 mySubSpeciesString = myStringArray[7];
00314 myLongitudeString = myStringArray[13];
00315 myLatitudeString = myStringArray[14];
00316 myInstitutionString = myStringArray[1];
00317 myCollectionCodeString = myStringArray[2];
00318 myAccessionCodeString = myStringArray[3];
00319 }
00320
00321 if (!myGenusString.isEmpty() &&
00322 !mySpeciesString.isEmpty() &&
00323 !myLongitudeString.isEmpty() &&
00324 !myLatitudeString.isEmpty())
00325 {
00326 mMessenger.emitMessage( myGenusString + " " + mySpeciesString + ", " + myLatitudeString + ", " + myLongitudeString);
00327
00328 OmgLocality myLocality;
00329 QString myId = myInstitutionString + "_" + myCollectionCodeString + "_" + myAccessionCodeString;
00330 myId = myId.replace(" ","");
00331 myId = Omgui::xmlEncode(myId);
00332 myLocality.setId(myId);
00333 myLocality.setLabel(myGenusString + " " +mySpeciesString);
00334 myLocality.setLatitude(myLatitudeString.toFloat());
00335 myLocality.setLongitude(myLongitudeString.toFloat());
00336 if (!myLocality.isValid())
00337 {
00338 continue;
00339 }
00340 mLocalityVector.push_back(myLocality);
00341 myCountInt++;
00342 }
00343 }
00344 }
00345 qDebug() << myCountInt << " records found" ;
00346
00347
00348
00349
00350
00351 QString myTextFileName = createTextFile(mFileName);
00352 if (myTextFileName.isEmpty())
00353 {
00354 mMessenger.emitFileNotWritten(mTaxonName);
00355 }
00356 else
00357 {
00358 createShapefile(mFileName);
00359 mMessenger.emitFileWritten(mFileName, myTextFileName,mTaxonName,myCountInt);
00360 }
00362 mLocalityVector.clear();
00363 qDebug("\n\n ------------------------ end of part 2 of gbif search ------------------- \n\n");
00364 return true;
00365 }
00366 Q_EXPORT_PLUGIN2(gbif_scraper_plugin, OmgScraperGbif );