3 de out. de 2012

Build a simple crawler to download website content using Java and NIO

The web is public, and so is your access to it. It is very simple to download and store web content using Java and the New I/O (NIO) API.

See this simple Main class example:

package mypackage;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URL;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.text.SimpleDateFormat;
import java.util.Date;

/**
 * Minimal crawler example: downloads a fixed list of web pages and stores each
 * one as a file under a base directory, using NIO channels for the copy.
 */
public class MySimpleCrawler {

    /** Maximum number of bytes requested from the source per {@code transferFrom} call (16 MiB). */
    private static final long CHUNK_SIZE = 1L << 24;

    /**
     * Timestamp format used in the progress messages.
     * SimpleDateFormat is not thread-safe; that is acceptable here because this
     * class only ever uses it from the single thread running {@link #main}.
     */
    private static final SimpleDateFormat TIMESTAMP_FORMAT =
            new SimpleDateFormat("dd/MM/yyyy HH:mm:ss.SSS");

    /**
     * Downloads the example pages into the base directory.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {

        final String baseDir = "/my_storaged_blogs/the-simple-work/";
        try {

            writeFile("http://www.thesimplework.blogspot.com", baseDir, "index-of-blog.html", 1000);

            writeFile("http://thesimplework.blogspot.com.br/2012/09/how-to-execute-paralel-process-using.html", baseDir, "my-favorite-post-etc.html", 1000);

        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Restore the interrupt flag so code further up the stack can still see it.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }

    }

    /**
     * Waits for {@code interval} milliseconds, then downloads {@code fromUrl}
     * and writes its content to {@code toDir/toFile}, replacing any existing file.
     *
     * @param fromUrl  URL to download
     * @param toDir    destination directory; created (with parents) when missing
     * @param toFile   name of the file to write inside {@code toDir}
     * @param interval delay in milliseconds before the download starts
     * @throws IOException          if the directory cannot be created, the URL cannot be
     *                              opened, or the copy fails
     * @throws InterruptedException if the thread is interrupted while waiting
     */
    private static void writeFile(final String fromUrl, final String toDir, final String toFile, final int interval) throws IOException, InterruptedException {

        Thread.sleep(interval);

        System.out.println(TIMESTAMP_FORMAT.format(new Date()) + " Getting URL= " + fromUrl);

        File directory = new File(toDir);
        // mkdirs() returns false both on failure and when the directory already
        // exists, so re-check before reporting an error.
        if (!directory.exists() && !directory.mkdirs() && !directory.isDirectory()) {
            throw new IOException("Could not create directory " + toDir);
        }

        String toDirWithSep = toDir;

        if (!toDirWithSep.endsWith("/")) {
            toDirWithSep += "/";
        }

        URL website = new URL(fromUrl);

        // try-with-resources closes both the network stream and the output file,
        // even when the transfer fails part-way.
        try (ReadableByteChannel rbc = Channels.newChannel(website.openStream());
             FileOutputStream fos = new FileOutputStream(toDirWithSep + toFile)) {

            // A single transferFrom call copies at most CHUNK_SIZE bytes, so keep
            // going until the source reports no more data (0 bytes transferred).
            long position = 0;
            long transferred;
            while ((transferred = fos.getChannel().transferFrom(rbc, position, CHUNK_SIZE)) > 0) {
                position += transferred;
            }
        }

        System.out.println(TIMESTAMP_FORMAT.format(new Date()) + " Sucessfully writed " + toFile + " at dir " + toDir + ".");
    }

}

Nenhum comentário:

Postar um comentário