ShareThis

Sunday, October 30, 2011

[JAVA] Grab all hyperlinks from a website - Source Code


import java.io.*;
import java.net.*;
import javax.swing.text.*;
import javax.swing.text.html.*;

/**
 * Command-line tool that prints the {@code href} of every anchor
 * ({@code <a>}) found in an HTML document.
 *
 * <p>Usage: {@code java GetLinks <url-or-file>}. The single argument is
 * either an {@code http:}/{@code https:} URL or the path of a local file.
 * The process exits with status 0 on success and 1 on any failure.
 */
class GetLinks {

    /**
     * Parses the document named by {@code args[0]} and writes each
     * hyperlink target to standard output, one per line.
     *
     * @param args command-line arguments; {@code args[0]} is the URL or
     *             file name to read
     */
    public static void main(String[] args) {
        if (args == null || args.length < 1) {
            System.err.println("Usage: java GetLinks <url-or-file>");
            System.exit(1);
            return;
        }

        EditorKit kit = new HTMLEditorKit();
        Document doc = kit.createDefaultDocument();

        // The Document class does not yet handle charsets properly, so
        // tell the parser to ignore any charset directive in the page
        // instead of aborting the parse when it meets one.
        doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);

        int status = 0;
        Reader rd = null;
        try {
            // Create a reader on the HTML content.
            rd = getReader(args[0]);

            // Parse the HTML into the document model.
            kit.read(rd, doc, 0);

            // Walk every element of the parsed document. Anchors are
            // stored as an attribute keyed by HTML.Tag.A whose value is
            // itself an attribute set holding the anchor's attributes.
            ElementIterator it = new ElementIterator(doc);
            javax.swing.text.Element elem;
            while ((elem = it.next()) != null) {
                Object anchor = elem.getAttributes().getAttribute(HTML.Tag.A);
                if (anchor instanceof AttributeSet) {
                    Object href = ((AttributeSet) anchor)
                            .getAttribute(HTML.Attribute.HREF);
                    // Named anchors (<a name="...">) carry no href; skip
                    // them rather than printing "null".
                    if (href != null) {
                        System.out.println(href);
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
            status = 1;
        } finally {
            // Always release the underlying file handle or connection.
            if (rd != null) {
                try {
                    rd.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if close fails here.
                }
            }
        }
        System.exit(status);
    }

    /**
     * Returns a reader on the HTML data. If {@code uri} begins with
     * {@code "http:"} or {@code "https:"} it is treated as a URL;
     * otherwise it is assumed to be a local file name.
     *
     * <p>The caller is responsible for closing the returned reader.
     *
     * @param uri URL or local file name of the HTML document
     * @return a reader positioned at the start of the document
     * @throws IOException if the URL cannot be opened or the file cannot
     *                     be read
     */
    static Reader getReader(String uri) throws IOException {
        if (uri.startsWith("http:") || uri.startsWith("https:")) {
            // Retrieve from Internet.
            URLConnection conn = new URL(uri).openConnection();
            return new InputStreamReader(conn.getInputStream());
        }

        // Retrieve from file.
        return new FileReader(uri);
    }
}


0 comments:

Post a Comment