View Single Post
Old Nov 21st, 2007, 9:48 PM   #6
null_ptr0
12 years old
 
Join Date: Nov 2007
Posts: 94
Rep Power: 1 null_ptr0 is on a distinguished road
Re: Network Programming Help.

java Syntax (Toggle Plain Text)
  import java.io.ByteArrayOutputStream;
  import java.io.IOException;
  import java.io.InputStream;
  import java.net.HttpURLConnection;
  import java.net.URL;
  import java.net.URLConnection;
  import java.util.ArrayList;
  import java.util.List;
  import java.util.Vector;
  import java.util.regex.Matcher;
  import java.util.regex.Pattern;
  7.  
  /**
   * Command-line link checker.
   *
   * <p>Downloads the page at the address given as the single program argument,
   * extracts the {@code href} of every anchor tagged {@code class=l}, requests
   * each extracted address and prints how many of them did not answer with
   * HTTP 200.
   */
  class URLCrawler {
      /**
       * Matches an anchor tag that carries {@code class=l} before its
       * {@code href} attribute; capture group 1 is the href value without
       * its surrounding quotes. Compiled once because it never changes.
       */
      private static final Pattern LINK_PATTERN = Pattern.compile(
              "a\\s+[^>]*?class=l\\s+[^>]*?href\\s?=[\\s'\"]+(.*?)['\"]+.*?>[^<]*</a>");

      /** Connect/read timeout so one unresponsive host cannot stall the whole run. */
      private static final int TIMEOUT_MILLIS = 10000;

      /** Size of the chunk buffer used while draining a response body. */
      private static final int BUFFER_SIZE = 8192;

      /**
       * Program entry point.
       *
       * @param argv exactly one element: the address of the page to crawl;
       *             any other argument count prints a usage line instead
       */
      public static void main(String[] argv) {
          if (argv.length != 1) {
              System.out.println("Arguments: <address (String)>");
              return;
          }

          String html;
          try {
              html = downloadSource(argv[0]);
          } catch (IOException ioex) {
              // Without the start page there is nothing to check: report and
              // terminate with a non-zero status.
              System.err.println("Could not download " + argv[0] + ": " + ioex);
              System.exit(1);
              return; // unreachable; lets the compiler see that html is assigned below
          }
          checkAddresses(parseAddresses(html));
      }

      /**
       * Downloads the complete body served at the given address.
       *
       * @param url absolute address of the page to fetch
       * @return the response body decoded as UTF-8
       * @throws IOException if the address is malformed or the transfer fails
       */
      private static String downloadSource(String url) throws IOException {
          URLConnection connection = new URL(url).openConnection();
          connection.setConnectTimeout(TIMEOUT_MILLIS);
          connection.setReadTimeout(TIMEOUT_MILLIS);

          InputStream in = connection.getInputStream();
          try {
              // Read until end-of-stream: the content length may be unknown (-1)
              // and a single read() call is allowed to return only part of the body.
              ByteArrayOutputStream body = new ByteArrayOutputStream();
              byte[] chunk = new byte[BUFFER_SIZE];
              int count;
              while ((count = in.read(chunk)) != -1) {
                  body.write(chunk, 0, count);
              }
              // Decode with a fixed charset instead of the platform default so
              // the result does not depend on the machine running the program.
              return body.toString("UTF-8");
          } finally {
              in.close();
          }
      }

      /**
       * Extracts the link targets of all anchors matched by {@link #LINK_PATTERN}.
       *
       * @param html page source to scan; {@code null} is treated as empty
       * @return the captured href values in document order, never {@code null}
       */
      private static String[] parseAddresses(String html) {
          if (html == null) {
              return new String[0];
          }
          List<String> addresses = new ArrayList<String>();
          Matcher matcher = LINK_PATTERN.matcher(html);
          while (matcher.find()) {
              // group(1) is the href itself; group() would be the entire
              // anchor tag, which can never be parsed as a URL.
              addresses.add(matcher.group(1));
          }
          return addresses.toArray(new String[0]);
      }

      /**
       * Checks every address and prints a one-line summary to standard output.
       *
       * @param urls addresses to check
       */
      private static void checkAddresses(String[] urls) {
          int broken = 0;
          for (String url : urls) {
              if (isBroken(url)) {
                  broken++;
              }
          }
          // System.out rather than System.console(): the console is null
          // whenever the program runs without an attached terminal.
          System.out.printf("%s urls extracted were broken and %s were in tact, out of %s urls",
                  broken, urls.length - broken, urls.length);
      }

      /**
       * Tells whether the given address fails to answer with HTTP 200.
       *
       * @param url address to request
       * @return {@code false} only when the request completes with status 200;
       *         {@code true} for any other status, for addresses that are
       *         malformed or not HTTP(S), and for I/O failures
       */
      private static boolean isBroken(String url) {
          HttpURLConnection connection = null;
          try {
              URLConnection raw = new URL(url).openConnection();
              if (!(raw instanceof HttpURLConnection)) {
                  // No status code exists for other protocols, so the link
                  // cannot be confirmed as working.
                  return true;
              }
              connection = (HttpURLConnection) raw;
              connection.setConnectTimeout(TIMEOUT_MILLIS);
              connection.setReadTimeout(TIMEOUT_MILLIS);
              return connection.getResponseCode() != HttpURLConnection.HTTP_OK;
          } catch (IOException ioex) {
              System.err.println("Could not check " + url + ": " + ioex);
              return true;
          } finally {
              if (connection != null) {
                  connection.disconnect();
              }
          }
      }
  }
That's what I programmed in 5 minutes.
null_ptr0 is offline   Reply With Quote