// Grinding Java - Searching the WWW in Java: grinding3.java

 import java.awt.*; import java.awt.event.*; import java.net.*; import java.lang.*; import java.io.*; import java.util.Vector;

/**
 * The URLParaser class loads the content of a URL into memory and searches
 * through it for a search string and for the links it contains.
 * It allows us to iterate through the links in the URL.
 *
 * <p>Loading and link extraction both happen synchronously inside the
 * constructor, so a fully constructed instance is always ready to be queried.
 *
 * <p>Known limitations of the (deliberately simple) scanner: only links of the
 * form {@code a href="..."} with double quotes are recognised, and only links
 * whose text ends in {@code htm} or {@code html} are kept.
 **/
public class URLParaser {

    /** Scratch buffer size, in bytes, used while downloading the page. */
    private static final int READ_BUFFER_SIZE = 1000;

    /** The URL where we started our search. */
    private URL URLPointer;

    /**
     * The URL that relative links are resolved against. Defaults to the page
     * URL; replaced by the BASE HREF address when the document declares one.
     */
    private URL baseURL;

    /** Used to store the current href we are working on. */
    private String currentToken;

    /** Used to iterate through the list of links. */
    private int currentURLPointer = 0;

    /** The full text of the page we are searching. */
    private String textOfTheURL;

    /** The lowercased full text of the page we are searching. */
    private String lowerCaseTextOfTheURL;

    /** The list of links (URL objects) in this page, filled by findLinksInURL. */
    private Vector listOfLinks = new Vector();

    /**
     * The constructor takes a parameter for a url to parse. The page is
     * downloaded and scanned for links before the constructor returns.
     * If the page cannot be read, an error line is printed and the instance
     * behaves like an empty page (no text, no links).
     *
     * @param url the page to load; must not be null.
     **/
    public URLParaser(URL url) {
        loadURL(url);
        findLinksInURL();
    }

    /**
     * Get the next url in the list of links from the url we are searching.
     *
     * @return the next link, or null when every link has been returned.
     **/
    synchronized public URL getNextURL() {
        if (currentURLPointer >= listOfLinks.size()) {
            return null;
        }
        URL returnValue = (URL) listOfLinks.elementAt(currentURLPointer);
        currentURLPointer++;
        return returnValue;
    }

    /**
     * Returns true if this string is in the url. The comparison ignores case.
     *
     * @param s the text to look for; must not be null.
     * @return true if s occurs anywhere in the loaded page text.
     **/
    public boolean isStringInWWWPage(String s) {
        return lowerCaseTextOfTheURL.indexOf(toLowerCase(s)) >= 0;
    }

    /**
     * Reads a url into memory and stores it as one string (textOfTheURL).
     * Every byte is mapped to the character with the same value (0-255).
     * On an I/O error whatever was read so far is kept and an error line is
     * printed.
     **/
    private void loadURL(URL url) {
        URLPointer = url;
        StringBuffer bufferOfTheURL = new StringBuffer();
        InputStream urlStream = null;
        try {
            urlStream = url.openConnection().getInputStream();
            byte[] buffer = new byte[READ_BUFFER_SIZE];
            int readResult;
            while ((readResult = urlStream.read(buffer)) != -1) {
                // Append only the bytes this call delivered; the rest of the
                // buffer still holds data from earlier reads.
                for (int counter = 0; counter < readResult; counter++) {
                    // Mask so bytes >= 0x80 are not sign-extended to 0xFFxx.
                    bufferOfTheURL.append((char) (buffer[counter] & 0xFF));
                }
            }
        } catch (IOException e) {
            System.out.println("Error in reading URL: " + url.toString());
        } finally {
            if (urlStream != null) {
                try {
                    urlStream.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
        textOfTheURL = bufferOfTheURL.toString().trim();
    }

    /**
     * Find the html BASE HREF command.
     * This command names the address that every relative link in the page is
     * resolved against. This method sets the currentToken member to the
     * string of the base href.
     *
     * @param positionOfToken index in the page text at which to start looking.
     * @return the index of the closing quote of the value, or -1 if not found.
     **/
    private int getBaseHREF(int positionOfToken) {
        return extractQuotedValue("base href", positionOfToken);
    }

    /**
     * Find the html A HREF command.
     * This command indicates a link; this method finds its position in the
     * page and sets the currentToken member to the string of the current link.
     *
     * @param positionOfToken index in the page text at which to start looking.
     * @return the index of the closing quote of the value, or -1 if not found.
     **/
    private int getHREF(int positionOfToken) {
        return extractQuotedValue("a href", positionOfToken);
    }

    /**
     * Shared scanner for getBaseHREF and getHREF: finds the next occurrence
     * of keyword at or after from, then the first double-quoted value that
     * follows it, and stores that value (with its original casing) in
     * currentToken. currentToken is left untouched when nothing is found.
     *
     * @param keyword lower-case text to search for.
     * @param from    index at which to start the search.
     * @return the index of the closing quote, or -1 if the keyword or a
     *         complete quoted value is missing.
     */
    private int extractQuotedValue(String keyword, int from) {
        int keywordPosition = lowerCaseTextOfTheURL.indexOf(keyword, from);
        if (keywordPosition == -1) {
            return -1;
        }
        int openingQuote = lowerCaseTextOfTheURL.indexOf("\"", keywordPosition);
        if (openingQuote == -1) {
            return -1;
        }
        int closingQuote = lowerCaseTextOfTheURL.indexOf("\"", openingQuote + 1);
        if (closingQuote == -1) {
            return -1;
        }
        currentToken = textOfTheURL.substring(openingQuote + 1, closingQuote);
        return closingQuote;
    }

    /**
     * findLinksInURL looks for all the links in textOfTheURL
     * and adds every link to the listOfLinks.
     * This method walks the tags in the HTML text and filters out all those
     * that do not contain an HREF command (an HTML link).
     **/
    private void findLinksInURL() {
        lowerCaseTextOfTheURL = toLowerCase(textOfTheURL);

        // The base defaults to the page itself. If there is a BASE HREF
        // command then the base changes to reflect that address.
        baseURL = URLPointer;
        if (getBaseHREF(0) != -1) {
            try {
                baseURL = new URL(URLPointer, currentToken);
            } catch (MalformedURLException err) {
                System.out.println("Encountered a bad link in the destination HTML document: \n"
                        + currentToken);
            }
        }

        for (int htmlIterator = lowerCaseTextOfTheURL.indexOf("<");
                htmlIterator != -1;
                htmlIterator = lowerCaseTextOfTheURL.indexOf("<", htmlIterator + 1)) {
            currentToken = null;
            // Jump to the end of the next link so the scan resumes after it.
            htmlIterator = getHREF(htmlIterator);
            if (htmlIterator == -1) {
                break;
            }
            addToListOfLinks(currentToken);
        }
    }

    /**
     * This method is called by findLinksInURL to add an HREF style link
     * to the list of the links which will then be used to iterate through the
     * list of links in this URL. ftp:, mailto: and news: links are ignored, as
     * are links that do not end in htm or html. Relative links are resolved
     * against the base address of the page.
     **/
    private void addToListOfLinks(String token) {
        if (token == null) {
            return;
        }
        // Match the scheme including its colon, so that relative links such
        // as "newsletter.html" are not mistaken for news: links.
        if (token.startsWith("ftp:") || token.startsWith("mailto:")
                || token.startsWith("news:")) {
            return;
        }
        if (!(token.endsWith("htm") || token.endsWith("html"))) {
            return;
        }
        try {
            // URL(context, spec) leaves absolute links alone and resolves
            // relative and root-relative ("/x.html") links against the base.
            listOfLinks.addElement(new URL(baseURL, token));
        } catch (MalformedURLException err) {
            System.out.println("Encountered a bad link in the destination HTML document: \n"
                    + token);
        }
    }

    /**
     * Lower-cases text with a fixed locale so that matching of markup
     * keywords does not depend on the platform's default locale.
     */
    private static String toLowerCase(String text) {
        return text.toLowerCase(java.util.Locale.ENGLISH);
    }
}