Hadoop+Cassandra (3) - Mapper & ReducerでCassandra -

2012-04-08T00:00:00+09:00 Cassandra Hadoop Java

HadoopでCassandraを使ってみるネタシリーズはこれで終わり。1回目と2回目をくっつけて、Cassandraから読み込み、Cassandraにプッシュしてみる。MapperとReducerは、1回目で書いたSampleCassandraMapperと2回目で書いたSampleCassandraReducerをそのまま使う。なので、Hadoopジョブを投げる側だけを作れば良い。

package sample;

import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.List;

import org.apache.cassandra.hadoop.ColumnFamilyInputFormat;
import org.apache.cassandra.hadoop.ColumnFamilyOutputFormat;
import org.apache.cassandra.hadoop.ConfigHelper;
import org.apache.cassandra.thrift.SlicePredicate;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import sample.mapreduce.SampleCassandraMapper;
import sample.mapreduce.SampleCassandraReducer;

/**
 * Job driver that wires {@link SampleCassandraMapper} and
 * {@link SampleCassandraReducer} together: input is read from the
 * {@code Keyspace1.Sample} column family and output is written to the
 * {@code Keyspace1.Outputs} column family.
 *
 * <p>Run through {@link ToolRunner} so that Hadoop generic options
 * ({@code -D key=value}, {@code -conf}, ...) are parsed and applied.
 */
public class HadoopClient3 extends Configured implements Tool {

    /**
     * Launches the job and terminates the JVM with the job's exit status.
     *
     * @param args command-line arguments; generic Hadoop options are consumed
     *             by {@link ToolRunner}
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        // Propagate the status so callers (shell scripts, schedulers) can detect failure.
        System.exit(ToolRunner.run(new Configuration(), new HadoopClient3(), args));
    }

    /**
     * Configures and submits the Cassandra-to-Cassandra MapReduce job, then
     * blocks until it finishes.
     *
     * @param args remaining command-line arguments (unused)
     * @return {@code 0} if the job completed successfully, {@code 1} otherwise
     * @throws Exception if job setup or submission fails
     */
    @Override
    public int run(String[] args) throws Exception {
        // Start from the configuration injected by ToolRunner; a bare
        // "new JobConf()" would silently drop every generic option.
        JobConf conf = new JobConf(getConf());
        conf.setJar("sample.jar");

        // Cassandra connection and column family settings.
        ConfigHelper.setInitialAddress(conf, "127.0.0.1");
        ConfigHelper.setRpcPort(conf, "9160");
        ConfigHelper.setInputColumnFamily(conf, "Keyspace1", "Sample");
        ConfigHelper.setOutputColumnFamily(conf, "Keyspace1", "Outputs");
        ConfigHelper.setPartitioner(conf, "org.apache.cassandra.dht.RandomPartitioner");

        // Only the "name" column is fetched for each input row.
        SlicePredicate predicate = new SlicePredicate().setColumn_names(Arrays.asList(ByteBufferUtil.bytes("name")));
        ConfigHelper.setInputSlicePredicate(conf, predicate);

        Job job = new Job(conf);
        job.setMapperClass(SampleCassandraMapper.class);
        job.setReducerClass(SampleCassandraReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setInputFormatClass(ColumnFamilyInputFormat.class);
        job.setOutputFormatClass(ColumnFamilyOutputFormat.class);

        // Key/value types handed to ColumnFamilyOutputFormat by the reducer.
        job.setOutputKeyClass(ByteBuffer.class);
        job.setOutputValueClass(List.class);

        return job.waitForCompletion(true) ? 0 : 1;
    }
}

Apache tikaを使ってみる Hadoop+Cassandra (2) - ReducerでCassandra -