Grinding Java - Searching the WWW in Java:grinding3.java
Appearance
import java.awt.*;
import java.awt.event.*;
import java.net.*;
import java.lang.*;
import java.io.*;
import java.util.Locale;
import java.util.Vector;

/**
 * Loads the document behind a URL into memory, lets the caller search its text
 * and iterates over the HTML links found in it.
 *
 * <p>All work (download and link extraction) happens synchronously in the
 * constructor; once the constructor returns the object is fully populated.
 *
 * <p>Link extraction is a plain text scan, not an HTML parser: a link is the first
 * double-quoted string that follows the text <code>a href</code>. Unquoted
 * attribute values are therefore not recognised.
 **/
public class URLParaser {

    /** Number of bytes requested from the stream per read. */
    private static final int READ_BUFFER_SIZE = 1000;

    /**
     * Downloads <code>url</code> and extracts its links.
     *
     * @param url the page to load; must not be null
     * @throws NullPointerException if <code>url</code> is null
     **/
    public URLParaser(URL url) {
        if (url == null) {
            throw new NullPointerException("url");
        }
        loadURL(url);
        findLinksInURL();
    }

    /**
     * Returns the next link found in the page, advancing an internal cursor.
     * Synchronized so that the cursor is read and advanced as one step.
     *
     * @return the next link, or null once every link has been returned
     **/
    synchronized public URL getNextURL() {
        if (currentURLPointer >= listOfLinks.size()) {
            return null;
        }
        URL next = (URL) listOfLinks.elementAt(currentURLPointer);
        currentURLPointer++;
        return next;
    }

    /**
     * Case-insensitive test for whether the page text contains <code>s</code>.
     *
     * @param s the text to look for; null is treated as "not found"
     * @return true if <code>s</code> occurs anywhere in the loaded page
     **/
    public boolean isStringInWWWPage(String s) {
        if (s == null) {
            return false;
        }
        // The page is lower-cased once at parse time; only the query needs converting.
        return lowerCaseTextOfTheURL.indexOf(s.toLowerCase(Locale.ENGLISH)) >= 0;
    }

    /**
     * Reads the whole document into memory and stores it, trimmed, in
     * <code>textOfTheURL</code>. On an I/O error a message is printed and whatever
     * was read up to that point is kept.
     **/
    private void loadURL(URL url) {
        URLPointer = url;
        StringBuffer pageText = new StringBuffer();
        InputStream in = null;
        try {
            in = url.openConnection().getInputStream();
            byte[] buffer = new byte[READ_BUFFER_SIZE];
            int bytesRead;
            while ((bytesRead = in.read(buffer)) != -1) {
                // Only the first bytesRead bytes are valid; the rest of the buffer is
                // left over from earlier reads and must not be appended.
                for (int i = 0; i < bytesRead; i++) {
                    // Mask to 0..255 so bytes >= 0x80 are not sign-extended into
                    // 0xFFxx characters (this is ISO-8859-1 decoding).
                    pageText.append((char) (buffer[i] & 0xFF));
                }
            }
        } catch (IOException e) {
            System.out.println("Error in reading URL: " + url.toString());
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails; the text is already read.
                }
            }
        }
        textOfTheURL = pageText.toString().trim();
    }

    /**
     * Finds the next occurrence of <code>marker</code> (which must be lower case) at
     * or after <code>from</code> and locates the first double-quoted string after it.
     *
     * @return a two element array {start, end} such that
     *         <code>textOfTheURL.substring(start, end)</code> is the quoted value,
     *         or null if the marker or a complete pair of quotes is not found
     **/
    private int[] findQuotedValue(String marker, int from) {
        int markerAt = lowerCaseTextOfTheURL.indexOf(marker, from);
        if (markerAt == -1) {
            return null;
        }
        int openQuote = lowerCaseTextOfTheURL.indexOf('"', markerAt);
        if (openQuote == -1) {
            return null;
        }
        int closeQuote = lowerCaseTextOfTheURL.indexOf('"', openQuote + 1);
        if (closeQuote == -1) {
            return null;
        }
        return new int[] { openQuote + 1, closeQuote };
    }

    /**
     * Determines the URL against which relative links are resolved: the value of
     * the document's BASE HREF if it has a usable one, otherwise the URL of the
     * document itself.
     **/
    private URL findBaseURL() {
        int[] span = findQuotedValue("base href", 0);
        if (span == null) {
            return URLPointer;
        }
        String baseHref = textOfTheURL.substring(span[0], span[1]).trim();
        try {
            return new URL(URLPointer, baseHref);
        } catch (MalformedURLException ignored) {
            // An unusable BASE HREF is ignored; fall back to the document's own URL.
            return URLPointer;
        }
    }

    /**
     * Scans the page text for links and adds each acceptable one to
     * <code>listOfLinks</code>.
     **/
    private void findLinksInURL() {
        // Lower-cased copy used for all case-insensitive searching. The loaded text
        // only holds characters 0..255, for which English lower-casing is
        // one-to-one, so indexes in the copy are valid in the original text too.
        lowerCaseTextOfTheURL = textOfTheURL.toLowerCase(Locale.ENGLISH);
        baseURL = findBaseURL();

        int tagStart = textOfTheURL.indexOf('<');
        while (tagStart != -1) {
            int[] span = findQuotedValue("a href", tagStart);
            if (span == null) {
                break; // no further links in the document
            }
            addToListOfLinks(textOfTheURL.substring(span[0], span[1]));
            // Continue with the first tag after the closing quote of this link.
            tagStart = textOfTheURL.indexOf('<', span[1] + 1);
        }
    }

    /**
     * Adds one link to <code>listOfLinks</code> if it is of interest.
     * ftp:, mailto: and news: links are skipped, as is anything that does not end
     * in "htm" or "html" (compared case-insensitively). Relative links are resolved
     * against the base URL. A link that cannot be turned into a URL is reported on
     * standard output and skipped.
     **/
    private void addToListOfLinks(String token) {
        String link = token.trim();
        String lowerLink = link.toLowerCase(Locale.ENGLISH);

        // Compare including the ':' so that relative links such as
        // "newsletter.html" are not mistaken for a news: link.
        if (lowerLink.startsWith("ftp:")
                || lowerLink.startsWith("mailto:")
                || lowerLink.startsWith("news:")) {
            return;
        }
        if (!(lowerLink.endsWith("htm") || lowerLink.endsWith("html"))) {
            return;
        }
        try {
            // URL(context, spec) leaves absolute links alone and resolves relative
            // ones, including root-relative ("/x.html") and "../" forms.
            listOfLinks.addElement(new URL(baseURL, link));
        } catch (MalformedURLException err) {
            System.out.println("Encountered a bad link in the destination HTML document: \n" + token);
        }
    }

    /** URL against which relative links are resolved (BASE HREF or the page URL). */
    private URL baseURL;

    /** The URL where we started our search. */
    private URL URLPointer;

    /** Cursor used by getNextURL to iterate through the list of links. */
    private int currentURLPointer = 0;

    /** The full text of the page we are searching. */
    private String textOfTheURL;

    /** Lower-cased copy of the full text of the page we are searching. */
    private String lowerCaseTextOfTheURL;

    /** The links found in this page, as URL objects (filled by findLinksInURL). */
    private final Vector listOfLinks = new Vector();
}