import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Command-line link checker.
 *
 * <p>Downloads a single page, extracts the {@code href} targets of its
 * {@code class=l} anchors, requests every target and reports how many of them
 * did not answer with HTTP 200.
 */
class URLCrawler {

    /**
     * Matches an anchor that carries {@code class=l} followed by a quoted
     * {@code href}; capture group 1 is the href value without its quotes.
     * Compiled once because the expression never changes.
     */
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "a\\s+[^>]*?class=l\\s+[^>]*?href\\s?=[\\s'\"]+(.*?)['\"]+.*?>[^<]*</a>");

    /** Size of the chunk buffer used while draining the response stream. */
    private static final int BUFFER_SIZE = 8192;

    /**
     * Entry point.
     *
     * @param argv exactly one element: the address of the page to scan;
     *             any other argument count prints a usage line instead
     */
    public static void main(String[] argv) {
        if (argv.length != 1) {
            System.out.println("Arguments: <address (String)>");
        } else {
            checkAddresses(parseAddresses(downloadSource(argv[0])));
        }
    }

    /**
     * Downloads the body of the given address and returns it as text.
     *
     * <p>The body is read until end-of-stream rather than trusting
     * {@code Content-Length}, which may be absent (-1) and which a single
     * {@code read} call is not guaranteed to satisfy anyway. The bytes are
     * decoded as UTF-8 regardless of the charset the server declares.
     *
     * <p>On any I/O failure the stack trace is printed and the JVM exits with
     * status 1.
     *
     * @param url address of the page to download
     * @return the page body
     */
    private static String downloadSource(String url) {
        InputStream in = null;
        try {
            URLConnection connection = new URL(url).openConnection();
            in = connection.getInputStream();

            ByteArrayOutputStream body = new ByteArrayOutputStream();
            byte[] chunk = new byte[BUFFER_SIZE];
            int count;
            while ((count = in.read(chunk)) != -1) {
                body.write(chunk, 0, count);
            }
            return body.toString("UTF-8");
        } catch (IOException ioex) {
            ioex.printStackTrace();
            System.exit(1);
            return ""; // never reached at runtime; required by the compiler
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
    }

    /**
     * Extracts the link targets of all {@code class=l} anchors in the markup.
     *
     * <p>Package-private (rather than private) so it can be exercised by tests
     * without network access.
     *
     * @param html markup to scan; {@code null} is treated as empty
     * @return the captured href values in document order, never {@code null}
     */
    static String[] parseAddresses(String html) {
        if (html == null) {
            return new String[0];
        }
        List<String> addresses = new ArrayList<String>();
        Matcher m = LINK_PATTERN.matcher(html);
        while (m.find()) {
            // group(1) is the href value; group() would be the whole anchor.
            addresses.add(m.group(1));
        }
        return addresses.toArray(new String[0]);
    }

    /**
     * Probes every address and prints a one-line summary to standard output.
     *
     * @param urls addresses to probe
     */
    private static void checkAddresses(String[] urls) {
        int broken = 0;
        for (String url : urls) {
            if (isBroken(url)) {
                broken++;
            }
        }
        // System.out instead of System.console(): the console is null whenever
        // the process is not attached to an interactive terminal.
        System.out.printf("%s urls extracted were broken and %s were in tact, out of %s urls",
                broken, urls.length - broken, urls.length);
        System.out.flush();
    }

    /**
     * Tells whether requesting the address fails to produce an HTTP 200.
     *
     * @param url address to probe
     * @return {@code false} only when the response code is 200; {@code true}
     *         for any other code, for a non-HTTP connection, or when the
     *         request throws an {@link IOException} (including a malformed URL)
     */
    private static boolean isBroken(String url) {
        HttpURLConnection http = null;
        try {
            URLConnection connection = new URL(url).openConnection();
            if (!(connection instanceof HttpURLConnection)) {
                // e.g. file: or ftp: targets have no HTTP status to inspect.
                return true;
            }
            http = (HttpURLConnection) connection;
            return http.getResponseCode() != HttpURLConnection.HTTP_OK;
        } catch (IOException ioex) {
            ioex.printStackTrace();
            return true;
        } finally {
            if (http != null) {
                http.disconnect();
            }
        }
    }
}
// Author's note: this is what I programmed in 5 minutes.