// Grinding Java - Searching the WWW in Java: grinding3.java

import java.awt.*;
import java.awt.event.*;
import java.net.*;
import java.lang.*;
import java.io.*;
import java.util.Vector;

/**
 * The URLParaser class loads the page behind a URL into memory, lets the
 * caller test whether a search string occurs in it, and lets the caller
 * iterate over the HTML links found in it.
 *
 * All loading and parsing happens synchronously inside the constructor, so a
 * fully constructed instance is always ready to be queried.
 *
 * The link scanner is a simple text heuristic, not an HTML parser: it only
 * recognises links written as <code>a href</code> followed by a value in
 * double quotes, and it only keeps links whose text ends in "htm" or "html".
**/
public class URLParaser
{
   /**
    * Loads the given URL and collects the links it contains.
    * If the page cannot be read an error line is printed and the instance
    * behaves as if the page contained whatever was read before the failure.
    *
    * @param url the page to load and parse; must not be null.
    * @throws NullPointerException if url is null.
   **/
   public URLParaser(URL url)
   {
      if (url == null)
      {
         throw new NullPointerException("url");
      }
      loadURL(url);
      findLinksInURL();
   }

   /**
    * Returns the next link found in the page, in document order.
    *
    * @return the next link, or null once every link has been returned.
   **/
   synchronized public URL getNextURL()
   {
      if (currentURLPointer >= listOfLinks.size())
      {
         return null;
      }
      URL nextLink = (URL) listOfLinks.elementAt(currentURLPointer);
      currentURLPointer++;
      return nextLink;
   }

   /**
    * Tests, ignoring case, whether the given string occurs in the page text.
    *
    * @param s the text to look for.
    * @return true if s occurs in the page; false if it does not or s is null.
   **/
   public boolean isStringInWWWPage(String s)
   {
      if (s == null)
      {
         return false;
      }
      // The lowercase copy of the page is computed once in loadURL.
      return lowerCaseTextOfTheURL.indexOf(toLowerCase(s)) >= 0;
   }

   /**
    * Reads the page behind the URL into memory and stores it as one string
    * (textOfTheURL) together with its lowercase copy.
    *
    * Bytes are decoded as ISO-8859-1, which maps every byte to exactly one
    * character, so no input can fail to decode.
    *
    * @param url the page to read.
   **/
   private void loadURL(URL url)
   {
      URLPointer = url;
      StringBuffer pageText = new StringBuffer();
      InputStream pageStream = null;
      try
      {
         pageStream = url.openConnection().getInputStream();
         InputStreamReader pageReader = new InputStreamReader(pageStream, "ISO-8859-1");
         char[] chunk = new char[READ_BUFFER_SIZE];
         int charsRead;
         // Append only what each read actually delivered; a read may fill
         // less than the whole chunk, and -1 delivers nothing at all.
         while ((charsRead = pageReader.read(chunk)) != -1)
         {
            pageText.append(chunk, 0, charsRead);
         }
      }
      catch (IOException e)
      {
         System.out.println("Error in reading URL: " + url.toString());
      }
      finally
      {
         if (pageStream != null)
         {
            try
            {
               pageStream.close();
            }
            catch (IOException ignored)
            {
               // Nothing useful can be done if closing fails; the text that
               // was read is still used.
            }
         }
      }
      textOfTheURL = pageText.toString().trim();
      lowerCaseTextOfTheURL = toLowerCase(textOfTheURL);
   }

   /**
    * Lowercases text with a fixed locale so that matching does not depend on
    * the default locale of the machine running the search.
   **/
   private static String toLowerCase(String text)
   {
      return text.toLowerCase(java.util.Locale.ENGLISH);
   }

   /**
    * Finds the next occurrence of keyword (given in lowercase) at or after
    * searchFrom and stores the double-quoted value that follows it in
    * currentToken, with its original letter case.
    *
    * @param keyword    the lowercase text to look for, e.g. "a href".
    * @param searchFrom the index in the page text to start searching from.
    * @return the index of the closing quote, or -1 if the keyword or either
    *         quote was not found (currentToken is left untouched then).
   **/
   private int findQuotedValueAfter(String keyword, int searchFrom)
   {
      int keywordAt = lowerCaseTextOfTheURL.indexOf(keyword, searchFrom);
      if (keywordAt == -1)
      {
         return -1;
      }
      int openingQuote = lowerCaseTextOfTheURL.indexOf('"', keywordAt);
      if (openingQuote == -1)
      {
         return -1;
      }
      int closingQuote = lowerCaseTextOfTheURL.indexOf('"', openingQuote + 1);
      if (closingQuote == -1)
      {
         return -1;
      }
      // The lowercase copy has the same length as the original text, so the
      // indices found in it can be used to cut the original text.
      currentToken = textOfTheURL.substring(openingQuote + 1, closingQuote);
      return closingQuote;
   }

   /**
    * Finds the HTML BASE HREF command.
    * This command names the address against which the relative links of the
    * page are to be resolved.
    * On success currentToken is set to the value of the base href.
    *
    * @param positionOfToken the index to start searching from.
    * @return the index of the closing quote of the value, or -1 if not found.
   **/
   private int getBaseHREF(int positionOfToken)
   {
      return findQuotedValueAfter("base href", positionOfToken);
   }

   /**
    * Finds the next HTML A HREF command.
    * This command indicates a link; on success currentToken is set to the
    * quoted value of that link.
    *
    * @param positionOfToken the index to start searching from.
    * @return the index of the closing quote of the value, or -1 if not found.
   **/
   private int getHREF(int positionOfToken)
   {
      return findQuotedValueAfter("a href", positionOfToken);
   }

   /**
    * Looks for all the links in textOfTheURL and adds every accepted link to
    * listOfLinks.
    *
    * Relative links are resolved against the URL of the page itself unless
    * the page contains a BASE HREF command, in which case that address is
    * used instead.
   **/
   private void findLinksInURL()
   {
      baseURL = URLPointer;
      if (getBaseHREF(0) != -1)
      {
         try
         {
            baseURL = new URL(URLPointer, currentToken);
         }
         catch (MalformedURLException err)
         {
            // An unusable BASE HREF is ignored; links are then resolved
            // against the URL of the page.
            baseURL = URLPointer;
         }
      }

      int cursor = textOfTheURL.indexOf('<');
      while (cursor != -1)
      {
         currentToken = null;
         cursor = getHREF(cursor);
         if (cursor == -1)
         {
            break; // no further links in the page.
         }
         addToListOfLinks(currentToken);
         // Continue with the first tag after the link that was just read.
         cursor = textOfTheURL.indexOf('<', cursor + 1);
      }
   }

   /**
    * Called by findLinksInURL to add an HREF style link to the list of links
    * which is then used to iterate through the links of this URL.
    *
    * Links using the ftp, mailto or news scheme are skipped, as are links
    * whose text does not end in "htm" or "html". Everything else is resolved
    * against the base address; a link that cannot be turned into a URL is
    * reported and skipped.
    *
    * @param token the raw text of the link as written in the page.
   **/
   private void addToListOfLinks(String token)
   {
      if (token == null)
      {
         return;
      }

      // Test for the scheme including its colon, so that relative pages such
      // as "news.html" or "ftpsites.html" are not mistaken for other schemes.
      String lowerCaseToken = toLowerCase(token);
      if (lowerCaseToken.startsWith("ftp:") || lowerCaseToken.startsWith("mailto:") ||
          lowerCaseToken.startsWith("news:"))
      {
         return;
      }

      if (!(token.endsWith("htm") || token.endsWith("html")))
      {
         return;
      }

      try
      {
         // Resolving against the base handles absolute links, links relative
         // to the directory of the page, links starting with "/" (relative
         // to the server root) and ".." segments alike.
         listOfLinks.addElement(new URL(baseURL, token));
      }
      catch (MalformedURLException err)
      {
         System.out.println("Encountered a bad link in the destination HTML document: \n" + token);
      }
   }

   private static final int READ_BUFFER_SIZE = 1000; // characters read per pass.
   private URL baseURL; // the address relative links are resolved against:
            // the URL of the page, or the BASE HREF of the page if it has one.
   private String currentToken; // used to store the current href we are working on.
   private URL URLPointer; // The URL where we started our search.
   private int currentURLPointer = 0; // used to iterate through the list of links.
   private String textOfTheURL; // the full text of the page we are searching.
   private String lowerCaseTextOfTheURL; // the lowercased full text of the
                                         // page we are searching.
   private Vector listOfLinks = new Vector(); // the list of links in this URL
                                              // (filled by findLinksInURL).
}