The web is public, and so is your access to it. It is very simple to download and store web content using Java and NIO (the java.nio "new I/O" package).
See this simple Main class example:
package mypackage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URL;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.text.SimpleDateFormat;
import java.util.Date;
/**
 * Minimal "crawler" example: downloads the content behind a URL and stores it
 * as a file on the local disk, using NIO channels for the copy.
 */
public class MySimpleCrawler {

    /** Maximum number of bytes requested from the source per {@code transferFrom} call (16 MiB). */
    private static final long TRANSFER_CHUNK = 1L << 24;

    /**
     * Timestamp format used in the console messages.
     * SimpleDateFormat is not thread-safe, so this instance must not be shared across threads.
     */
    private static final SimpleDateFormat sdf = new SimpleDateFormat("dd/MM/yyyy HH:mm.SSS");

    /**
     * Downloads two sample pages into a fixed base directory.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) {
        final String baseDir = "/my_storaged_blogs/the-simple-work/";
        try {
            writeFile("http://www.thesimplework.blogspot.com", baseDir, "index-of-blog.html",1000);
            writeFile("http://thesimplework.blogspot.com.br/2012/09/how-to-execute-paralel-process-using.html", baseDir, "my-favorite-post-etc.html",1000);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Restore the interrupt flag so code further up the stack can still observe it.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
    }

    /**
     * Waits for {@code interval} milliseconds, then copies the content of
     * {@code fromUrl} into the file {@code toFile} inside {@code toDir}.
     * The target directory is created when it does not exist; an existing
     * target file is overwritten.
     *
     * @param fromUrl  URL to read from
     * @param toDir    target directory, with or without a trailing "/"
     * @param toFile   name of the file to write inside {@code toDir}
     * @param interval pause in milliseconds before the download starts
     * @throws IOException          if the directory cannot be created, the URL cannot be
     *                              opened, or reading/writing fails
     * @throws InterruptedException if the thread is interrupted during the pause
     */
    private static void writeFile(final String fromUrl, final String toDir, final String toFile, final int interval) throws IOException, InterruptedException {
        Thread.sleep(interval);
        System.out.println(sdf.format(new Date()) +" Getting URL= " + fromUrl);

        File directory = new File(toDir);
        // Fail early with a clear message instead of a later FileNotFoundException.
        if (!directory.exists() && !directory.mkdirs() && !directory.isDirectory()) {
            throw new IOException("Could not create directory " + toDir);
        }

        String toDirWithSep = toDir;
        if (!toDirWithSep.endsWith("/")) {
            toDirWithSep +="/";
        }

        URL website = new URL(fromUrl);
        // try-with-resources closes both the network channel and the file, even on failure.
        try (ReadableByteChannel rbc = Channels.newChannel(website.openStream());
             FileOutputStream fos = new FileOutputStream(toDirWithSep + toFile)) {
            // A single transferFrom call copies at most TRANSFER_CHUNK bytes, so keep
            // going until the source reports end-of-stream (0 bytes transferred).
            long position = 0;
            long transferred;
            while ((transferred = fos.getChannel().transferFrom(rbc, position, TRANSFER_CHUNK)) > 0) {
                position += transferred;
            }
        }

        System.out.println( sdf.format(new Date()) + " Sucessfully writed "+ toFile + " at dir " + toDir +"." ) ;
    }
}
Nenhum comentário:
Postar um comentário