/**
 * Created: May 9, 2007 
 * Updated: November 30, 2009 
 * @author Chris Jarabek (cjjarabe@ucalgary.ca)
 * 
 * Class Description: This abstract class is defined in order to ensure that
 * each plugin class that extends this class behaves in a semi-consistent
 * manner.
 *  
 */
package org.xenbase.scraper;

import java.io.InputStream;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.io.IOUtils;
import org.xenbase.scraper.data.ScrapedData;

public abstract class BasicScraper {
    // Parsing strings that are used frequently throughout the various scraping modules.

    /** Literal text "Figure", shared by the scraping modules. */
    protected String FIGURE_TEXT = "Figure";

    /** Name of the HttpClient parameter that forces all cookies into a single request header. */
    protected String SINGLE_COOKIE_HEADER = "http.protocol.single-cookie-header";

    /** Name of the UTF-8 character set. */
    protected String UTF8 = "UTF-8";

    /** A single double-quote character. */
    protected String QUOTE = "\"";

    /**
     * Takes the URL (produced by {@link #getRedirURL(String)}) and returns the
     * images and captions of that article. This is the core of the scraper;
     * each webpage is different, so different string parsing is done for
     * different journals.
     *
     * @param url direct URL to the full article (usually produced by
     *            {@link #getRedirURL(String)})
     * @return the object containing all the images and captions
     * @throws Exception if scraping fails
     * @throws Error if a serious error occurs while scraping
     */
    public abstract ScrapedData scrape(String url) throws Exception, Error;

    /**
     * Because we are using URLs from PubMed and because each journal
     * publisher's website is different, we need to go through a series of HTTP
     * 301 redirects, then search the resulting page to find the URL of the full
     * article. Because each publisher website is different, this function
     * needs to be unique for each journal publisher website.
     *
     * @param url URL to the full article from PubMed
     * @return the actual URL of the full journal article
     * @throws Exception if the redirect chain cannot be resolved
     * @throws Error if a serious error occurs while resolving the URL
     */
    public abstract String getRedirURL(String url) throws Exception, Error;

    /**
     * Takes a URL in String format and returns a byte array of the contents of
     * the site at the URL provided. This is how pages and images are actually
     * downloaded. This function makes use of the Apache HttpClient class, as it
     * was one of the few HTTP classes that provides sufficient functionality
     * for browser spoofing, which was required in order to correctly access the
     * journal websites.
     *
     * <p>The HTTP status code of the response is not inspected; whatever body
     * the server sends is returned. The underlying connection is always
     * released, whether the download succeeds or fails.
     *
     * @param url the URL to download
     * @return the raw bytes of the response body
     * @throws Exception if the request or the download fails; the failure is
     *                   logged to standard output and then rethrown unchanged
     * @throws Error if a serious error occurs; it is logged to standard output
     *               and then rethrown unchanged
     */
    public byte[] getData(String url) throws Exception, Error {
        // Declared outside the try so the finally block can release it.
        GetMethod httpget = null;
        try {
            HttpClient httpclient = new HttpClient();
            httpclient.getHttpConnectionManager().getParams().setConnectionTimeout(30000);

            // This needs to be set because one of the cookies contains a
            // domain with a period which messes the code up otherwise.
            httpclient.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);

            // Some journals will only accept a request if all the cookies are in one string.
            httpclient.getParams().setParameter(SINGLE_COOKIE_HEADER, Boolean.TRUE);

            httpget = new GetMethod(url);
            httpclient.executeMethod(httpget);

            InputStream is = httpget.getResponseBodyAsStream();
            return IOUtils.toByteArray(is);
        } catch (Exception e) {
            System.out.println("Exception in BasicScraper.getData: " + e.getMessage());
            throw e;
        } catch (Error e) {
            System.out.println("Error in BasicScraper.getData: " + e.getMessage());
            throw e;
        } finally {
            // Release the connection on every path; previously a failure while
            // executing the request or reading the body leaked the connection.
            if (httpget != null) {
                httpget.releaseConnection();
            }
        }
    }
}