
Hadoop (13) - Compression -

2011-04-13T00:00:00+00:00 Java Hadoop

Let's try GZIP compression using org.apache.hadoop.io.compress.CompressionCodec.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.io.IOUtils;

public class Client {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // point the client at the local HDFS instance and register the gzip codec
        conf.set("fs.default.name", "hdfs://localhost:9000");
        conf.set("io.compression.codecs", "org.apache.hadoop.io.compress.GzipCodec");

        FileSystem fs = FileSystem.get(conf);
        Path path = new Path("hdfs://localhost:9000/user/kinjouj/input/data.txt.gz");

        // the factory picks GzipCodec based on the .gz extension of the output path
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(path);

        FSDataInputStream is = null;
        CompressionOutputStream os = null;

        try {
            // read the plain text file and write it back out compressed through the codec
            is = fs.open(new Path("hdfs://localhost:9000/user/kinjouj/input/data.txt"));
            os = codec.createOutputStream(fs.create(path));

            IOUtils.copyBytes(is, os, conf);
        } catch(Exception e) {
            e.printStackTrace();
        } finally {
            if(os != null) {
                os.close();
            }
            if(is != null) {
                is.close();
            }
        }
    }
}
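
For reference, reading the compressed file back is the reverse operation: wrap the HDFS input stream with codec.createInputStream. The following is a minimal sketch assuming the same paths as above; the DecompressClient class name and the local data.txt output file are made up for illustration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.CompressionInputStream;

import java.io.FileOutputStream;
import java.io.OutputStream;

public class DecompressClient {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.default.name", "hdfs://localhost:9000");
        conf.set("io.compression.codecs", "org.apache.hadoop.io.compress.GzipCodec");

        FileSystem fs = FileSystem.get(conf);
        // same compressed file as above (assumed path)
        Path path = new Path("hdfs://localhost:9000/user/kinjouj/input/data.txt.gz");

        // the codec is again resolved from the .gz extension
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(path);

        CompressionInputStream is = null;
        OutputStream os = null;

        try {
            // wrap the raw HDFS stream with the codec to read decompressed bytes
            is = codec.createInputStream(fs.open(path));
            // write the decompressed data to a local file (name is an assumption)
            os = new FileOutputStream("data.txt");

            IOUtils.copyBytes(is, os, conf);
        } finally {
            IOUtils.closeStream(os);
            IOUtils.closeStream(is);
        }
    }
}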