Main Page | Modules | Class Hierarchy | Alphabetical List | Class List | Directories | File List | Class Members | File Members | Related Pages

omgscrapergbif.cpp

Go to the documentation of this file.
00001 /***************************************************************************
00002  *   Copyright (C) 2005 by Tim Sutton   *
00003  *   tim@linfiniti.com   *
00004  *                                                                         *
00005  *   This program is free software; you can redistribute it and/or modify  *
00006  *   it under the terms of the GNU General Public License as published by  *
00007  *   the Free Software Foundation; either version 2 of the License, or     *
00008  *   (at your option) any later version.                                   *
00009  *                                                                         *
00010  *   This program is distributed in the hope that it will be useful,       *
00011  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
00012  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
00013  *   GNU General Public License for more details.                          *
00014  *                                                                         *
00015  *   You should have received a copy of the GNU General Public License     *
00016  *   along with this program; if not, write to the                         *
00017  *   Free Software Foundation, Inc.,                                       *
00018  *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
00019  ***************************************************************************/
00020 
00021 #include "omgscrapergbif.h"
00022 #include "omgwebpagefetcher.h"
00023 #include "omgui.h"
00024 
00025 //QT Includes
00026 #include <QTimer>
00027 #include <QRegExp>
00028 #include <QDebug>
00029 #include <QFileInfo>
00030 #include <QFile>
00031 #include <QTextStream>
00032 //needed for the Q_EXPORT_PLUGIN2 macro at the end of this file
00033 #include <QtPlugin> 
00034 OmgScraperGbif::OmgScraperGbif(QObject * parent) : 
00035   QObject(parent),
00036   OmgScraperPluginInterface()
00037 {
00038   qDebug("OmgScraperGbif constructor called...");
00039 }
00040 
00041 
00042 OmgScraperGbif::~OmgScraperGbif()
00043 {
00044 }
00045 
00046 const QString  OmgScraperGbif::getName() 
00047 { 
00048   return QString("GBIF Plugin"); 
00049 }
00050 const QString OmgScraperGbif::getLicense()
00051 {
00052   QFile myQFile( ":/gbif_terms.txt" );
00053   QString myString;
00054   if ( myQFile.open( QIODevice::ReadOnly ) ) 
00055   {
00056     //now we parse the loc file, checking each line for its taxon
00057     QTextStream myStream( &myQFile );
00058     myString = myStream.readAll();
00059     myQFile.close();
00060   }
00061   else
00062   {
00063     myString="Terms and conditions document for GBIF could not be retrieved.";
00064   }
00065   return myString; 
00066 }
00067 
00068 bool OmgScraperGbif::search(QString theTaxonName, QString theFileName)
00069 {
00070 
00071   mTaxonName = theTaxonName;
00072   mFileName = theFileName;
00073 
00074   if (mTaxonName.isEmpty())
00075   {
00076     mMessenger.emitError("Taxon name is empty!");
00077     return false;
00078   }
00079 
00080   if (mFileName.isEmpty())
00081   {
00082     mMessenger.emitError("File name is empty!");
00083     return false;
00084   }
00085 
00086   mTaxonName = theTaxonName;
00087   mFileName = theFileName;
00088   QString mySearchString=theTaxonName.simplified();
00089   mySearchString=mySearchString.replace(" ","+");
00090   QString myUrlString = "http://www.secretariat.gbif.net/portal/ecat_search.jsp?";
00091   myUrlString += "termsAccepted=true&search=";
00092   myUrlString += mySearchString;
00093   myUrlString += "&countryKey=0&searchType=1&searchMode=1";
00094 
00095   qDebug("\n\n ------------------------ part 1 of gbif search ------------------- \n\n");
00096   qDebug ("URL for Search string = " + myUrlString.toLocal8Bit());
00097   OmgWebPageFetcher myWebPageFetcher;
00098   connect(&myWebPageFetcher, SIGNAL(statusChanged(QString)),
00099           this, SLOT(setStatus(QString)));
00100   QString myFirstPage = myWebPageFetcher.getPage(myUrlString);
00101   QString mySecondUrl = taxonIdRequestDone(myFirstPage);
00102   if (!mySecondUrl.isEmpty())
00103   {
00104     //ideally we should just be able to reuse the previously made fetcher
00105     OmgWebPageFetcher myWebPageFetcher2;
00106     connect(&myWebPageFetcher2, SIGNAL(statusChanged(QString)),
00107         this, SLOT(setStatus(QString)));
00108     qDebug("\n\n ------------------------ part 2 of gbif search ------------------- \n\n");
00109     qDebug("URL for Search Result string = " + mySecondUrl.toLocal8Bit());
00110     QString mySecondPage = myWebPageFetcher2.getPage(mySecondUrl);
00111     localitiesRequestDone(mySecondPage);
00112   }
00113   return true;
00114 }
00115 
00116 
00117 //
00118 // This is run once the web page has been retrieved.
00119 //
00120 
00121 QString OmgScraperGbif::taxonIdRequestDone(QString theString)
00122 {
00123   if (theString.isEmpty())
00124   {
00125     mMessenger.emitMessage(tr("Error getting species id"));
00126     mMessenger.emitFileNotWritten(mTaxonName);
00127     return false;
00128   }
00129   QRegExp myPhraseRegex( "taxonKey=[0-9]*&" );
00130   QRegExp myIdRegex( "[0-9]*" );
00131 
00132   //
00133   // Taxon id regex
00134   //
00135   if (!myPhraseRegex.isValid())
00136   {
00137     mMessenger.emitError(tr("Gbif scraper:: The taxon id regex is invalid. Fix it and try again" ));
00138     return false;
00139   }
00140   else
00141   {
00142     //make sure greedy matches are off so if you have a string like
00143     // <b>blah</b><b>blahblah</b>
00144     // and your regex is <b>.*</b>
00145     // non greedy match will return <b>blah</b>
00146     myPhraseRegex.setMinimal(true);
00147   }
00148   //
00149   // Taxon id only regex (for trimming out just the numbers from above
00150   //
00151   if (!myIdRegex.isValid())
00152   {
00153     mMessenger.emitError(tr("Gbif scraper:: The taxon id 'only' regex  is invalid. Fix it and try again" ));
00154     return false;
00155   }
00156   else
00157   {
00158     //make sure greedy matches are off so if you have a string like
00159     // <b>blah</b><b>blahblah</b>
00160     // and your regex is <b>.*</b>
00161     // non greedy match will return <b>blah</b>
00162     myIdRegex.setMinimal(true);
00163   }
00164 
00165   //
00166   // Main parsing loop
00167   //
00168 
00169   //new way of doing a qt4 regexp.search
00170   //a little test to see if we can get the taxon id out ok
00171   int  myPosInt = myPhraseRegex.indexIn( theString,0 );
00172 
00173   if ( myPosInt >= 0 )
00174   {
00175     QString myPhraseLineString =
00176       theString.mid(myPosInt,myPhraseRegex.matchedLength());
00177     QString myIdString = myPhraseLineString;
00178     int myIdPosInt = myIdRegex.indexIn( myPhraseLineString,0);
00179     if ( myIdPosInt >= 0 )
00180     {
00181       //this wont work for me
00182       //QString myIdString = myPhraseLineString.mid(myIdPosInt,myIdRegex.matchedLength());
00183       //so I do this kludge
00184       myIdString = myIdString.replace("taxonKey=","");
00185       myIdString = myIdString.replace("&","");
00186       qDebug() <<   "Taxon id found! : " << myIdString ;
00187       //
00188       // Set the next part of our query in motion
00189       //
00190       QString myUrl = "http://www.secretariat.gbif.net/portal/download_issue.jsp?";
00191       myUrl += "termsAccepted=true&";
00192       myUrl += "taxonKey=" + myIdString + "&";
00193       myUrl += "countryKey=0&";
00194       myUrl += "resourceKey=0&";
00195       myUrl += "georeferencedOnly=false&";
00196       myUrl += "concepts=DateLastModified&";
00197       myUrl += "concepts=InstitutionCode&";
00198       myUrl += "concepts=CollectionCode&";
00199       myUrl += "concepts=CatalogNumber&";
00200       myUrl += "concepts=ScientificName&";
00201       myUrl += "concepts=Genus&";
00202       myUrl += "concepts=Species&";
00203       myUrl += "concepts=Subspecies&";
00204       myUrl += "concepts=YearCollected&";
00205       myUrl += "concepts=Country&";
00206       myUrl += "concepts=StateProvince&";
00207       myUrl += "concepts=County&";
00208       myUrl += "concepts=Locality&";
00209       myUrl += "concepts=Longitude&";
00210       myUrl += "concepts=Latitude&";
00211       myUrl += "concepts=Notes&indexOnly=true&";
00212       myUrl += "format=0&";
00213       myUrl += "download=Accept+terms";
00214       
00215       return myUrl;
00216     }
00217     else
00218     {
00219       mMessenger.emitFileNotWritten(mTaxonName);
00220       return QString();
00221     }
00222   }
00223   //rather dont do this as it blocks the ui
00224   qDebug ("No id found for name, cancelling search for this taxon!");
00225   //emit error("Gbif scraper:: The taxon id couldnt find a match in the page returned." );
00226   mMessenger.emitFileNotWritten(mTaxonName);
00227   return QString();
00228 }
00229 
00230 bool OmgScraperGbif::localitiesRequestDone(QString theString)
00231 {
00232   //qDebug() << theString;
00233   if (theString.isEmpty())
00234   {
00235     qDebug ("Localities request returned empty");
00236     mMessenger.emitError("Localities request returned empty");
00237     return false;
00238   }
00239 
00240   // System.out.println(myInputLineString);
00241   // tokenise each line that comes back and search through the
00242   // tokens...
00243 
00244   //now we parse the file looking for lat long occurrences
00245   QRegExp myRecordQRegExp(".*\n" );
00246   //
00247   //check each regex is valid and if not bail out
00248   //
00249   //
00250   // Record
00251   //
00252   if (!myRecordQRegExp.isValid())
00253   {
00254     mMessenger.emitError(tr( "GBIF scraper:: the record regex is invalid. Fix it and try again" ));
00255   }
00256   else
00257   {
00258     //make sure greedy matches are off so if you have a string like
00259     // <b>blah</b><b>blahblah</b>
00260     // and your regex is <b>.*</b>
00261     // non greedy match will return <b>blah</b>
00262     myRecordQRegExp.setMinimal(true);
00263   }
00264   //
00265   // Main parsing loop
00266   //
00267   int myPosInt = 0;    // where we are in the string
00268   int myCountInt = 0;  // how many matches we find
00269   bool myFirstRowFlag=true;
00270   while ( myPosInt >= 0 )
00271   {
00272     myPosInt = myRecordQRegExp.indexIn( theString,myPosInt );
00273 
00274     QString myGenusString = "";
00275     QString mySpeciesString = "";
00276     QString mySubSpeciesString = "";
00277     QString myLatitudeString = "";
00278     QString myLongitudeString = "";
00279     QString myInstitutionString = "";
00280     QString myCollectionCodeString = "";
00281     QString myAccessionCodeString = "";
00282 
00283     if ( myPosInt >= 0 )
00284     {
00285       //qDebug() <<   "Match found from pos " << myPosInt << " to " << myPosInt << myRecordQRegExp.matchedLength() ;
00286       QString myRecordQString =  theString.mid(myPosInt,myRecordQRegExp.matchedLength());
00287       //skip the length of the matched string
00288       myPosInt += myRecordQRegExp.matchedLength();
00289 
00290       //first row is header
00291       if (myFirstRowFlag==true)
00292       {
00293         myFirstRowFlag=false;
00294         continue;
00295       }
00296 
00297       QStringList myStringArray = myRecordQString.split("\t");
00298       //some debugging:
00299       //QListIterator<QString> i(myStringArray);
00300       //while (i.hasNext())
00301       //{
00302       //  qDebug() << i.next() << endl;
00303       //}
00304 
00305       if (myStringArray.size() < 15)
00306       {
00307         continue;
00308       }
00309       else
00310       {
00311         myGenusString = myStringArray[5];
00312         mySpeciesString = myStringArray[6];
00313         mySubSpeciesString = myStringArray[7];
00314         myLongitudeString = myStringArray[13];
00315         myLatitudeString = myStringArray[14];
00316         myInstitutionString =  myStringArray[1];
00317         myCollectionCodeString = myStringArray[2];
00318         myAccessionCodeString = myStringArray[3];
00319       }
00320       //create a OmgLocality struct and add it to the vector
00321       if (!myGenusString.isEmpty() && 
00322               !mySpeciesString.isEmpty() && 
00323               !myLongitudeString.isEmpty() && 
00324               !myLatitudeString.isEmpty())
00325       {
00326         mMessenger.emitMessage( myGenusString  + " " + mySpeciesString + ", " + myLatitudeString  + ", " + myLongitudeString);
00327 
00328         OmgLocality myLocality;
00329         QString myId = myInstitutionString + "_" + myCollectionCodeString + "_" + myAccessionCodeString;
00330         myId = myId.replace(" ","");
00331         myId = Omgui::xmlEncode(myId);
00332         myLocality.setId(myId);
00333         myLocality.setLabel(myGenusString + " " +mySpeciesString);
00334         myLocality.setLatitude(myLatitudeString.toFloat());
00335         myLocality.setLongitude(myLongitudeString.toFloat());
00336         if (!myLocality.isValid())
00337         {
00338           continue;
00339         }
00340         mLocalityVector.push_back(myLocality);
00341         myCountInt++;    // count the number of matches
00342       }
00343     }
00344   }
00345   qDebug() <<   myCountInt << " records found" ;
00346 
00347 
00348   //
00349   // Now build the shapefile
00350   //
00351   QString myTextFileName = createTextFile(mFileName);
00352   if (myTextFileName.isEmpty())
00353   {
00354     mMessenger.emitFileNotWritten(mTaxonName);
00355   }
00356   else
00357   {
00358     createShapefile(mFileName);
00359     mMessenger.emitFileWritten(mFileName, myTextFileName,mTaxonName,myCountInt);
00360   }
00362   mLocalityVector.clear();
00363   qDebug("\n\n ------------------------ end of part 2 of gbif search ------------------- \n\n");
00364   return true;
00365 }
00366 Q_EXPORT_PLUGIN2(gbif_scraper_plugin, OmgScraperGbif );

Generated on Mon Apr 28 15:09:32 2008 for openModellerDesktop by  doxygen 1.4.1-20050210