MahoutのLocalitySensitiveHashSearch
通称で言うとLSH。んまぁmixiさんの「LSH (Locality Sensitive Hashing) を用いた類似インスタンスペアの抽出」辺りが有名?ですかね
http://hillbig.cocolog-nifty.com/do/2009/02/lsh-spectral-ha.html に「LSHとはなんぞや」という解説があり、以下の本とかでも触れられているらしい(ていうか筆者かと)
で、MahoutにそのLSHのサポートがあるっぽいので、ちょっと使ってみたメモ
import java.util.ArrayList;
import java.util.List;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.neighborhood.LocalitySensitiveHashSearch;
import org.apache.mahout.math.random.WeightedThing;
/**
 * Minimal example of nearest-neighbour lookup with Mahout's
 * {@link LocalitySensitiveHashSearch}.
 *
 * <p>Builds nine two-dimensional points, indexes them, then searches for the
 * point closest to (3, 1) and prints each returned vector.
 */
public class Client {

    /**
     * Indexes a fixed set of sample points and prints the search results for
     * the query point (3, 1).
     *
     * @param args command-line arguments (unused)
     * @throws Exception declared by the original example; propagated as-is
     */
    public static void main(String[] args) throws Exception {
        double[][] values = {
            { 1, 2 },
            { 2, 4 },
            { 1, 3 },
            { 1, 5 },
            { 1, 4 },
            { 2, 3 },
            { 3, 2 },
            { 4, 1 },
            { 3, 1 }
        };

        int length = values.length;

        // The type arguments must be Mahout's Vector / WeightedThing (capitalised);
        // the lower-cased forms in the original listing do not name any type.
        List<Vector> data = new ArrayList<Vector>(length);
        for (int i = 0; i < length; i++) {
            double[] value = values[i];
            Vector vec = new RandomAccessSparseVector(value.length);
            vec.assign(value);
            // Name each point by its 1-based position so results can be identified.
            data.add(new NamedVector(vec, String.valueOf(i + 1)));
        }

        EuclideanDistanceMeasure measure = new EuclideanDistanceMeasure();
        LocalitySensitiveHashSearch lsh = new LocalitySensitiveHashSearch(
            measure,
            3 // described as "max results" in the original note; confirm against the Mahout Javadoc
        );
        lsh.addAll(data);

        // Query point; identical in value to the last indexed sample.
        Vector query = new RandomAccessSparseVector(2);
        query.assign(new double[] { 3, 1 });

        // Second argument (1) is the limit passed to search().
        List<WeightedThing<Vector>> results = lsh.search(query, 1);
        for (WeightedThing<Vector> result : results) {
            // Every indexed element was added as a NamedVector, so the cast
            // assumes the searcher returns the stored instances.
            NamedVector v = (NamedVector) result.getValue();
            System.out.println(v.asFormatString());
        }
    }
}