Hadoop Streamingを使ってPerlでMapReduceをやってみる

mapper.plを作成

#!/usr/bin/perl

# Hadoop Streaming mapper for word count: reads text lines from the input
# (STDIN, or files named in @ARGV) and prints one "<word>\t1" record for
# every word found.

use strict;
use warnings;

# Returns the whitespace-separated words of $line, in order of appearance.
sub words_in {
    my ($line) = @_;

    # split ' ' skips leading whitespace and treats any run of whitespace as
    # a single separator, so no empty "words" are produced.  split /\s/ would
    # return an empty string for every extra space or tab.
    my @words = split ' ', $line;

    return @words;
}

while (my $line = <>) {
    chomp $line;

    for my $word (words_in($line)) {
        print "$word\t1\n";
    }
}

reduce.plを作成

#!/usr/bin/perl

# Hadoop Streaming reducer for word count: reads "<word>\t<count>" records
# from the input (STDIN, or files named in @ARGV) and prints one
# "<word>\t<total>" line per distinct word.

use strict;
use warnings;

# Adds one input record to the running totals.
#   $totals - hash reference mapping word => accumulated count (modified)
#   $line   - one raw input line, with or without its trailing newline
# Records with an empty key (blank lines, or lines starting with a tab) are
# ignored.
sub add_record {
    my ($totals, $line) = @_;

    chomp $line;
    my ($key, $value) = split /\t/, $line;

    # A blank line leaves $key undefined; test for that first so that
    # length() is never applied to undef under "use warnings".
    return unless defined $key && length $key;

    # Add the count carried by the record instead of assuming it is 1, so
    # that pre-aggregated records are totalled correctly.  A missing or
    # non-numeric count is treated as a single occurrence.
    $value = 1 unless defined $value && $value =~ /\A\d+\z/;

    $totals->{$key} += $value;

    return;
}

my %count_for;

while (my $line = <>) {
    add_record(\%count_for, $line);
}

# Keys are sorted so that the output order is reproducible between runs.
for my $key (sort keys %count_for) {
    print "$key\t$count_for{$key}\n";
}

実行

# Every line except the last ends with "\" so that the shell passes all of
# the options to a single "hadoop jar" invocation.
hadoop jar \
    /path/to/hadoop/contrib/streaming/hadoop-streaming.jar \
    -input input \
    -output output \
    -mapper mapper.pl \
    -reducer reduce.pl \
    -file mapper.pl \
    -file reduce.pl