Grinding Java - Searching the WWW in Java:grinding2.java
From EDM2
import java.net.URL; interface URLLocatedListener { public void searchStringFoundInURL(URL url); } SearchEngine.java import java.awt.*; import java.awt.event.*; import java.net.*; import java.lang.*; import java.io.*; import java.util.Vector; import ProgressIndicator; /** * The SearchEngine class is a node in a web of its own. * A search engine is given a URL to search, it searches it for both the keyword and links, * when it finds the keyword it fires an event and for every link it finds it sends a new * search engine on a different thread. * The search engine tries to preserve memory (it is not very successful at that) and it * is responsible for limiting the number of active threads. When every search engine * terminates it checks for non-active threads and fires as many threads as possible. **/ public class SearchEngine implements Runnable { /** * the constructor of the SearchEngine is responsible for setting an environment in which * the thread can exist. It accepts the following parameters. * progressDialog - A dialog to which the status of the search is sent. * theURLWeMustSearch - the url to search. You can use a url to refer to a file on your HD too. * numberOfServersToBranchTo - if this equals 0 then every link that is not of this * domain will be ignored. If this number is above 0 then every link that is not * on this server will be branched to with numberOfServersToBranchTo - 1 as a * branching factor. This feature gives us a feel of distance. * The concept of distance can be expanded to include length of URL string or * number of pages from the source page rather than servers. * searchFor - the search string. You could improve here by accepting a search * structure which will support advanced boolean searches. It should be very * simple to implement. **/ public SearchEngine(ProgressIndicator progressDialog, URL theURLWeMustSearch, int numberOfServersToBranchTo, String searchFor) { URLToSearch = theURLWeMustSearch; this.progressDialog = progressDialog; progressDialog.addInformation("Searching in: " + URLToSearch.toString() + " thread count " + searchThreads.activeCount()); this.numberOfServersToBranchTo = numberOfServersToBranchTo; this.searchFor = searchFor; progressDialog.addToListOfPending(URLToSearch.toString()); // add this // url to the list of URLs pending search. } /** * the run method is the actual method that the Thread class calls. * This method does all the dirty work of searching and launching the new * threads. **/ public void run() { // this code block checks for free memory and tries to delay the thread if too much // memory is occupied. It calls the garbage collector if nothing else helps. try { for (int counter = 0; (WWWUtility.runtimeInformation.totalMemory() - WWWUtility.runtimeInformation.freeMemory()) > maxAmountOfMemoryToUse; counter++ ) if(counter != 10) Thread.currentThread().sleep(0,5); // put the thread to sleep for half a millisecond. else { WWWUtility.runtimeInformation.gc(); // invoke the garbage collector. break; } } catch(java.lang.InterruptedException e) { } runningThreads++; progressDialog.removeFromListOfPending(URLToSearch.toString()); // remove the url from the list pending search. progressDialog.addToListOfChecking(URLToSearch.toString()); // add the url to the list of currently searching URLs. URLParaser paraser = new URLParaser(URLToSearch); // create an // instance of the class url paraser which searches the url for us. if(paraser.isStringInWWWPage(searchFor)) sendFoundEvent(); // send the // event that a url was found. URL currentLink = getNextURL(paraser); // get the url of the first link in this url. SearchEngine newInstanceOfSearchEngine; boolean isTheCurrentURLOnTheSameServer; // used to determine server hops. while(currentLink != null) // while there are still more links on this page. { isTheCurrentURLOnTheSameServer = isThisURLOnTheSameServer(currentLink); if((numberOfServersToBranchTo > 0) || isTheCurrentURLOnTheSameServer) // if the current member of // the listOfHyperLinks is on the current server { if(!isTheCurrentURLOnTheSameServer) newInstanceOfSearchEngine = new SearchEngine(progressDialog, currentLink, numberOfServersToBranchTo - 1,searchFor); else newInstanceOfSearchEngine = new SearchEngine(progressDialog, currentLink, numberOfServersToBranchTo,searchFor); // create a new Version of SearchEngine with the same parameters. Thread t = new Thread(searchThreads,newInstanceOfSearchEngine); // launch the search on a different thread. if(searchThreads.activeCount() < maximumNumberOfThreads) // If there are already a lot of // threads running ,leave that // new thread suspended. { t.setPriority(defaultThreadPriority); // the priority should // be set to low if the system is not responsive. t.start(); } } currentLink = getNextURL(paraser); // get the next link. } paraser = null; // this will get the garbage collector working quicker. wakeupSleepingThreads(); // this method looks for other threads to launch runningThreads--; progressDialog.removeFromListOfChecking(URLToSearch.toString()); } /** * this method iterates thru the threads and looks for threads to launch * instead of this thread which will terminate soon. * The reason this method is synchronized is the structure allTheSearchThreads * imagine it filling up with a list of threads and then one of the threads * finishes excecution, the pointer to it would then be null. This caused * me some very annoying exceptions. **/ private synchronized void wakeupSleepingThreads() { try { if(runningThreads == -1) return; if(searchThreads.activeCount() > 1) { // if the number of total threads is bigger than 1 (this thread included), // then get the list of all the threads in the group and iterate through it. Thread[] allTheSearchThreads = new Thread[searchThreads.activeCount()]; searchThreads.enumerate(allTheSearchThreads); int counter = 0; boolean firstRound = true; while(((searchThreads.activeCount() < maximumNumberOfThreads) || (firstRound)) && (counter < allTheSearchThreads.length)) { if(!allTheSearchThreads[counter].isAlive()) // check if // the thread is running. { firstRound = false; allTheSearchThreads[counter].start(); // if its not running then start it. } counter++; } } else sendDoneEvent(); // there are no more threads terminate the execution. } catch(Exception allExceptions) { System.out.println("A thread exception was thrown with the following" + " error message: " + allExceptions.getMessage() + "\n" + "Search was terminated.\n" + "Note: there might not be an error message in the exception."); searchThreads.stop(); sendDoneEvent(); } } /** * If the URL was already checked return true otherwize false. * iterates through the urls and checks equality. Every url that is checked through * this method will be added to the list. **/ private boolean wasURLFound(URL url) { URL currentURL; for (int counter = 0 ; counter < visitedURLs.size(); counter++) { currentURL = (URL)visitedURLs.elementAt(counter); if(currentURL.equals(url)) return(true); } visitedURLs.addElement(url); return(false); } /** * This method stops all the SearchEngine threads. * It is called when the stop button is pressed. **/ public void stopSearching() { searchThreads.stop(); sendDoneEvent(); } /** * This method adds a listener to the event of finding a URL. **/ public void addFoundListener(URLLocatedListener eventListener) { listenersForURLFinding.addElement(eventListener); // add this listener // to our listener list } /** * This method notifies all the listeners that we found a url with the * search string. **/ private void sendFoundEvent() { // Calls all the listeners with the URL as parameter. for (int counter = 0;counter < listenersForURLFinding.size(); counter++) { ((URLLocatedListener) listenersForURLFinding.elementAt(counter)).searchStringFoundInURL(URLToSearch); } } /** * This method notifies all the listeners that the search is finished * or that it was terminated by the user. **/ private void sendDoneEvent() { runningThreads = -1; // Calls all the listeners with the URL as parameter. for (int counter = 0;counter < listenersForURLFinding.size(); counter++) { ((URLLocatedListener) listenersForURLFinding.elementAt(counter)).searchStringFoundInURL(null); } } /** * Returns true if the URL resides on the same server with the URLToSearch. * This is useful to test the distance and jumps between servers. **/ private boolean isThisURLOnTheSameServer(URL url) { return(url.getHost().equals(URLToSearch.getHost())); } /** * returns a url that is a link from the current page. It also tests * to see if we visited that url before. **/ private URL getNextURL(URLParaser paraser) { URL pointerToNestedPage = paraser.getNextURL(); while ((pointerToNestedPage != null) && (wasURLFound(pointerToNestedPage))) pointerToNestedPage = paraser.getNextURL(); return(pointerToNestedPage); } // Don't use more than 40% of the memory. private static long maxAmountOfMemoryToUse = (long)(WWWUtility.runtimeInformation.totalMemory() / 2.5); private static final int defaultThreadPriority = Thread.NORM_PRIORITY; // the priority used for the threads. private static final int maximumNumberOfThreads = 5; // the number of // threads that can be running at one time. private static Vector listenersForURLFinding = new Vector(); // the list of // the listeners for the search results. private static ThreadGroup searchThreads = new ThreadGroup("Search threads"); // the search threads. private static Vector visitedURLs = new Vector(); // URLs that were searched // to avoid duplicating our search. private int runningThreads = 0; // the number of running threads. private ProgressIndicator progressDialog; // the dialog showing the progress // of the search. private String searchFor; // the search string. private int numberOfServersToBranchTo; // the branching number of servers // to hop to. private URL URLToSearch; // the URL we are currently searching. }