Grinding Java - Searching the WWW in Java:grinding2.java

 import java.net.URL;

/**
 * Callback through which a SearchEngine reports its results.
 *
 * SearchEngine calls the listener with the URL of each page that contains
 * the search string, and calls it with <code>null</code> once the search
 * has finished or was stopped.
 */
interface URLLocatedListener {

    /**
     * Invoked when the search string was found, or when the search ended.
     *
     * @param url the page containing the search string, or <code>null</code>
     *            when the search is over.
     */
    void searchStringFoundInURL(URL url);
}

SearchEngine.java

import java.awt.*; import java.awt.event.*; import java.net.*; import java.lang.*; import java.io.*; import java.util.Vector;

import ProgressIndicator;

/** * The SearchEngine class is a node in a web of its own. * A search engine is given a URL to search, it searches it for both the keyword and links, * when it finds the keyword it fires an event and for every link it finds it sends a new * search engine on a different thread. * The search engine tries to preserve memory (it is not very successful at that) and it * is responsible for limiting the number of active threads. When every search engine * terminates it checks for non-active threads and fires as many threads as possible. public class SearchEngine implements Runnable {  /**    * the constructor of the SearchEngine is responsible for setting an environment in which * the thread can exist. It accepts the following parameters. * progressDialog - A dialog to which the status of the search is sent. * theURLWeMustSearch - the url to search. You can use a url to refer to a file on your HD too. * numberOfServersToBranchTo - if this equals 0 then every link that is not of this *   domain will be ignored. If this number is above 0 then every link that is not *   on this server will be branched to with  numberOfServersToBranchTo - 1 as a    *    branching factor. This feature gives us a feel of distance. * The concept of distance can be expanded to include length of URL string or   * number of pages from the source page rather than servers. * searchFor - the search string. You could improve here by accepting a search * structure which will support advanced boolean searches. It should be very * simple to implement. **/  public SearchEngine(ProgressIndicator progressDialog, URL theURLWeMustSearch,                       int numberOfServersToBranchTo, String searchFor) {     URLToSearch = theURLWeMustSearch;

this.progressDialog = progressDialog; progressDialog.addInformation("Searching in: " +                      URLToSearch.toString + " thread count " +                       searchThreads.activeCount);

this.numberOfServersToBranchTo = numberOfServersToBranchTo; this.searchFor = searchFor;

progressDialog.addToListOfPending(URLToSearch.toString); // add this // url to the list of URLs pending search. }

/**   * the run method is the actual method that the Thread class calls. * This method does all the dirty work of searching and launching the new * threads. **/  public void run {

// this code block checks for free memory and tries to delay the thread if too much // memory is occupied. It calls the garbage collector if nothing else helps. try {        for (int counter = 0;              (WWWUtility.runtimeInformation.totalMemory - WWWUtility.runtimeInformation.freeMemory) &gt;             maxAmountOfMemoryToUse;              counter++ ) if(counter != 10) Thread.currentThread.sleep(0,5); // put the thread to sleep for half a millisecond. else {              WWWUtility.runtimeInformation.gc; // invoke the garbage collector. break; }     }      catch(java.lang.InterruptedException e)      { }

runningThreads++; progressDialog.removeFromListOfPending(URLToSearch.toString); // remove the url from the list pending search. progressDialog.addToListOfChecking(URLToSearch.toString); // add the url to the list of currently searching URLs.

URLParaser paraser = new URLParaser(URLToSearch); // create an          // instance of the class url paraser which searches the url for us.

if(paraser.isStringInWWWPage(searchFor)) sendFoundEvent; // send the // event that a url was found.

URL currentLink = getNextURL(paraser); // get the url of the first link in this url. SearchEngine newInstanceOfSearchEngine; boolean isTheCurrentURLOnTheSameServer; // used to determine server hops. while(currentLink != null) // while there are still more links on this page. {        isTheCurrentURLOnTheSameServer = isThisURLOnTheSameServer(currentLink); if((numberOfServersToBranchTo &gt; 0) ||           isTheCurrentURLOnTheSameServer)  // if the current member of                   // the listOfHyperLinks is on the current server {           if(!isTheCurrentURLOnTheSameServer) newInstanceOfSearchEngine = new SearchEngine(progressDialog, currentLink,                                  numberOfServersToBranchTo - 1,searchFor); else newInstanceOfSearchEngine = new SearchEngine(progressDialog, currentLink,                                  numberOfServersToBranchTo,searchFor); // create a new Version of SearchEngine with the same parameters.

Thread t = new Thread(searchThreads,newInstanceOfSearchEngine); // launch the search on a different thread.

if(searchThreads.activeCount <              maximumNumberOfThreads) // If there are already a lot of                                       // threads running ,leave that // new thread suspended. {              t.setPriority(defaultThreadPriority); // the priority should // be set to low if the system is not responsive. t.start; }        }         currentLink = getNextURL(paraser); // get the next link. }

paraser = null; // this will get the garbage collector working quicker.

wakeupSleepingThreads; // this method looks for other threads to launch runningThreads--; progressDialog.removeFromListOfChecking(URLToSearch.toString); }

/**   * this method iterates thru the threads and looks for threads to launch * instead of this thread which will terminate soon. * The reason this method is synchronized is the structure allTheSearchThreads * imagine it filling up with a list of threads and then one of the threads * finishes excecution, the pointer to it would then be null. This caused * me some very annoying exceptions. **/  private synchronized void wakeupSleepingThreads {     try {        if(runningThreads == -1) return; if(searchThreads.activeCount &gt; 1) {           // if the number of total threads is bigger than 1 (this thread included), // then get the list of all the threads in the group and iterate through it. Thread[] allTheSearchThreads = new Thread[searchThreads.activeCount]; searchThreads.enumerate(allTheSearchThreads); int counter = 0; boolean firstRound = true; while(((searchThreads.activeCount < maximumNumberOfThreads) || (firstRound))              && (counter < allTheSearchThreads.length)) {              if(!allTheSearchThreads[counter].isAlive) // check if                                              // the thread is running. {                 firstRound = false; allTheSearchThreads[counter].start; // if its not running then start it. }              counter++; }        }         else sendDoneEvent; // there are no more threads terminate the execution. }     catch(Exception allExceptions) {        System.out.println("A thread exception was thrown with the following" +            " error message: " + allExceptions.getMessage + "\n" +            "Search was terminated.\n" +            "Note: there might not be an error message in the exception."); searchThreads.stop; sendDoneEvent; }  }

/**   * If the URL was already checked return true otherwize false. * iterates through the urls and checks equality. Every url that is checked through * this method will be added to the list. **/  private boolean wasURLFound(URL url) {     URL currentURL; for (int counter = 0 ; counter < visitedURLs.size; counter++) {        currentURL = (URL)visitedURLs.elementAt(counter); if(currentURL.equals(url)) return(true); }     visitedURLs.addElement(url); return(false); }

/**   * This method stops all the SearchEngine threads. * It is called when the stop button is pressed. **/  public void stopSearching {     searchThreads.stop; sendDoneEvent; }

/**   * This method adds a listener to the event of finding a URL. **/  public void addFoundListener(URLLocatedListener eventListener) {     listenersForURLFinding.addElement(eventListener); // add this listener // to our listener list }

/**   * This method notifies all the listeners that we found a url with the * search string. **/  private void sendFoundEvent {     // Calls all the listeners with the URL as parameter. for (int counter = 0;counter < listenersForURLFinding.size; counter++) { ((URLLocatedListener) listenersForURLFinding.elementAt(counter)).searchStringFoundInURL(URLToSearch); }  }

/**   * This method notifies all the listeners that the search is finished * or that it was terminated by the user. **/  private void sendDoneEvent {     runningThreads = -1; // Calls all the listeners with the URL as parameter. for (int counter = 0;counter < listenersForURLFinding.size; counter++) { ((URLLocatedListener) listenersForURLFinding.elementAt(counter)).searchStringFoundInURL(null); }  }

/**   * Returns true if the URL resides on the same server with the URLToSearch. * This is useful to test the distance and jumps between servers. **/  private boolean isThisURLOnTheSameServer(URL url) {     return(url.getHost.equals(URLToSearch.getHost)); }

/**   * returns a url that is a link from the current page. It also tests * to see if we visited that url before. **/  private URL getNextURL(URLParaser paraser) {     URL pointerToNestedPage = paraser.getNextURL; while ((pointerToNestedPage != null) &&            (wasURLFound(pointerToNestedPage))) pointerToNestedPage = paraser.getNextURL; return(pointerToNestedPage); }

// Don't use more than 40% of the memory. private static long maxAmountOfMemoryToUse = (long)(WWWUtility.runtimeInformation.totalMemory / 2.5);

private static final int defaultThreadPriority = Thread.NORM_PRIORITY; // the priority used for the threads.

private static final int maximumNumberOfThreads = 5; // the number of                 // threads that can be running at one time.

private static Vector listenersForURLFinding = new Vector; // the list of                 // the listeners for the search results. private static ThreadGroup searchThreads = new ThreadGroup("Search threads"); // the search threads. private static Vector visitedURLs = new Vector; // URLs that were searched // to avoid duplicating our search.

private int runningThreads = 0; // the number of running threads. private ProgressIndicator progressDialog; // the dialog showing the progress // of the search. private String searchFor; // the search string. private int numberOfServersToBranchTo; // the branching number of servers // to hop to. private URL URLToSearch; // the URL we are currently searching. } 