Grinding Java - Searching the WWW in Java:grinding2.java

From EDM2
Jump to: navigation, search
import java.net.URL;

/**
 * Callback through which a search reports its results.
 * Implementations are registered with SearchEngine.addFoundListener.
**/
interface URLLocatedListener
{
   /**
    * Called once for every page that contains the search string.
    * The SearchEngine class in this file also calls this method with a
    * null argument to announce that the search has finished or was
    * stopped, so implementations have to cope with null.
    *
    * @param url the page in which the search string was found, or null
    *            when the search is over.
   **/
   void searchStringFoundInURL(URL url);
}

SearchEngine.java

import java.awt.*;
import java.awt.event.*;
import java.net.*;
import java.lang.*;
import java.io.*;
import java.util.Vector;

import ProgressIndicator;

/**
 * The SearchEngine class is a node in a web of its own.
 * A search engine is given a URL and searches that page for both the keyword and links.
 * When it finds the keyword it fires an event, and for every link it finds it sends a new
 * search engine on a different thread.
 * The search engine tries to conserve memory (it is not very successful at that), and it
 * is responsible for limiting the number of active threads. Whenever a search engine
 * terminates, it checks for searches that have not been started yet and launches as many
 * of them as possible.
**/
public class SearchEngine implements Runnable
{
   /**
    * the constructor of the SearchEngine is responsible for setting an environment in which
    * the thread can exist. It accepts the following parameters.
    * progressDialog - A dialog to which the status of the search is sent.
    * theURLWeMustSearch - the url to search. You can use a url to refer to a file on your HD too.
    * numberOfServersToBranchTo - if this equals 0 then every link that is not of this
    *    domain will be ignored. If this number is above 0 then every link that is not
    *    on this server will be branched to with  numberOfServersToBranchTo - 1 as a
    *    branching factor. This feature gives us a feel of distance.
    * The concept of distance can be expanded to include length of URL string or
    * number of pages from the source page rather than servers.
    * searchFor - the search string. You could improve here by accepting a search
    * structure which will support advanced boolean searches. It should be very
    * simple to implement.
   **/
   public SearchEngine(ProgressIndicator progressDialog, URL theURLWeMustSearch,
                       int numberOfServersToBranchTo, String searchFor)
   {
      URLToSearch = theURLWeMustSearch;

      this.progressDialog = progressDialog;
      progressDialog.addInformation("Searching in: " +
                       URLToSearch.toString() + " thread count " +
                       searchThreads.activeCount());

      this.numberOfServersToBranchTo = numberOfServersToBranchTo;
      this.searchFor = searchFor;

      progressDialog.addToListOfPending(URLToSearch.toString()); // add this
                                  // url to the list of URLs pending search.
   }

   /**
    * the run method is the actual method that the Thread class calls.
    * This method does all the dirty work of searching and launching the new
    * threads.
   **/
   public void run()
   {

      // this code block checks for free memory and tries to delay the thread if too much
      // memory is occupied. It calls the garbage collector if nothing else helps.
      try
      {
         for (int counter = 0;
              (WWWUtility.runtimeInformation.totalMemory() -
                  WWWUtility.runtimeInformation.freeMemory()) >
              maxAmountOfMemoryToUse;
              counter++ )
            if(counter != 10) Thread.currentThread().sleep(0,5);
               // put the thread to sleep for half a millisecond.
            else
            {
               WWWUtility.runtimeInformation.gc(); // invoke the garbage collector.
               break;
            }
      }
      catch(java.lang.InterruptedException e)
      {
      }

      runningThreads++;
      progressDialog.removeFromListOfPending(URLToSearch.toString()); // remove the url from the list pending search.
      progressDialog.addToListOfChecking(URLToSearch.toString()); // add the url to the list of currently searching URLs.

      URLParaser paraser = new URLParaser(URLToSearch); // create an
           // instance of the class url paraser which searches the url for us.

      if(paraser.isStringInWWWPage(searchFor)) sendFoundEvent(); // send the
           // event that a url was found.

      URL currentLink = getNextURL(paraser); // get the url of the first link in this url.
      SearchEngine newInstanceOfSearchEngine;
      boolean isTheCurrentURLOnTheSameServer; // used to determine server hops.
      while(currentLink != null) // while there are still more links on this page.
      {
         isTheCurrentURLOnTheSameServer =
             isThisURLOnTheSameServer(currentLink);
         if((numberOfServersToBranchTo > 0) ||
            isTheCurrentURLOnTheSameServer)  // if the current member of
                   // the listOfHyperLinks is on the current server
         {
            if(!isTheCurrentURLOnTheSameServer)
               newInstanceOfSearchEngine =
                  new SearchEngine(progressDialog, currentLink,
                                   numberOfServersToBranchTo - 1,searchFor);
            else
               newInstanceOfSearchEngine =
                  new SearchEngine(progressDialog, currentLink,
                                   numberOfServersToBranchTo,searchFor);
                  // create a new Version of SearchEngine with the same parameters.

            Thread t = new Thread(searchThreads,newInstanceOfSearchEngine);
                 // launch the search on a different thread.

            if(searchThreads.activeCount() <
               maximumNumberOfThreads) // If there are already a lot of
                                       // threads running ,leave that
                                       // new thread suspended.
            {
               t.setPriority(defaultThreadPriority); // the priority should
                           // be set to low if the system is not responsive.
               t.start();
            }
         }
         currentLink = getNextURL(paraser); // get the next link.
      }

      paraser = null; // this will get the garbage collector working quicker.

      wakeupSleepingThreads(); // this method looks for other threads to launch
      runningThreads--;
      progressDialog.removeFromListOfChecking(URLToSearch.toString());
   }

   /**
    * this method iterates thru the threads and looks for threads to launch
    * instead of this thread which will terminate soon.
    * The reason this method is synchronized is the structure allTheSearchThreads
    * imagine it filling up with a list of threads and then one of the threads
    * finishes excecution, the pointer to it would then be null. This caused
    * me some very annoying exceptions.
   **/
   private synchronized void wakeupSleepingThreads()
   {
      try
      {
         if(runningThreads == -1) return;
         if(searchThreads.activeCount() > 1)
         {
            // if the number of total threads is bigger than 1 (this thread included),
            // then get the list of all the threads in the group and iterate through it.
            Thread[] allTheSearchThreads = new Thread[searchThreads.activeCount()];
            searchThreads.enumerate(allTheSearchThreads);
            int counter = 0;
            boolean firstRound = true;
            while(((searchThreads.activeCount() < maximumNumberOfThreads) || (firstRound))
               && (counter < allTheSearchThreads.length))
            {
               if(!allTheSearchThreads[counter].isAlive()) // check if
                                              // the thread is running.
               {
                  firstRound = false;
                  allTheSearchThreads[counter].start();
                  // if its not running then start it.
               }
               counter++;
            }
         }
         else sendDoneEvent(); // there are no more threads terminate the execution.
      }
      catch(Exception allExceptions)
      {
         System.out.println("A thread exception was thrown with the following" +
            " error message: " + allExceptions.getMessage() + "\n" +
            "Search was terminated.\n" +
            "Note: there might not be an error message in the exception.");
         searchThreads.stop();
         sendDoneEvent();
      }
   }

   /**
    * If the URL was already checked return true otherwize false.
    * iterates through the urls and checks equality. Every url that is checked through
    * this method will be added to the list.
   **/
   private boolean wasURLFound(URL url)
   {
      URL currentURL;
      for (int counter = 0 ; counter < visitedURLs.size(); counter++)
      {
         currentURL = (URL)visitedURLs.elementAt(counter);
         if(currentURL.equals(url)) return(true);
      }
      visitedURLs.addElement(url);
      return(false);
   }

   /**
    * This method stops all the SearchEngine threads.
    * It is called when the stop button is pressed.
   **/
   public void stopSearching()
   {
      searchThreads.stop();
      sendDoneEvent();
   }

   /**
    * This method adds a listener to the event of finding a URL.
   **/
   public void addFoundListener(URLLocatedListener eventListener)
   {
      listenersForURLFinding.addElement(eventListener); // add this listener
                                                        // to our listener list
   }

   /**
    * This method notifies all the listeners that we found a url with the
    * search string.
   **/
   private void sendFoundEvent()
   {
      // Calls all the listeners with the URL as parameter.
      for (int counter = 0;counter < listenersForURLFinding.size(); counter++) {
        ((URLLocatedListener) listenersForURLFinding.elementAt(counter)).searchStringFoundInURL(URLToSearch);
      }
   }

   /**
    * This method notifies all the listeners that the search is finished
    * or that it was terminated by the user.
   **/
   private void sendDoneEvent()
   {
      runningThreads = -1;
      // Calls all the listeners with the URL as parameter.
      for (int counter = 0;counter < listenersForURLFinding.size(); counter++) {
        ((URLLocatedListener) listenersForURLFinding.elementAt(counter)).searchStringFoundInURL(null);
      }
   }

   /**
    * Returns true if the URL resides on the same server with the URLToSearch.
    * This is useful to test the distance and jumps between servers.
   **/
   private boolean isThisURLOnTheSameServer(URL url)
   {
      return(url.getHost().equals(URLToSearch.getHost()));
   }

   /**
    * returns a url that is a link from the current page. It also tests
    * to see if we visited that url before.
   **/
   private URL getNextURL(URLParaser paraser)
   {
      URL pointerToNestedPage = paraser.getNextURL();
      while ((pointerToNestedPage != null) &&
             (wasURLFound(pointerToNestedPage)))
         pointerToNestedPage = paraser.getNextURL();
      return(pointerToNestedPage);
   }

   // Don't use more than 40% of the memory.
   private static long maxAmountOfMemoryToUse =
           (long)(WWWUtility.runtimeInformation.totalMemory() / 2.5);

   private static final int defaultThreadPriority = Thread.NORM_PRIORITY;
                  // the priority used for the threads.

   private static final int maximumNumberOfThreads = 5; // the number of
                  // threads that can be running at one time.

   private static Vector listenersForURLFinding = new Vector(); // the list of
                  // the listeners for the search results.
   private static ThreadGroup searchThreads = new ThreadGroup("Search threads");
                  // the search threads.
   private static Vector visitedURLs = new Vector(); // URLs that were searched
                  // to avoid duplicating our search.

   private int runningThreads = 0; // the number of running threads.
   private ProgressIndicator progressDialog; // the dialog showing the progress
                  // of the search.
   private String searchFor; // the search string.
   private int numberOfServersToBranchTo; // the branching number of servers
                  // to hop to.
   private URL URLToSearch; // the URL we are currently searching.
}