Map:
Vector featureVector = features.get();
if (featureVector.size() < minVectorSize) {
return;
}
// Initialize the MinHash values to highest
for (int i = 0; i < numHashFunctions; i++) {
minHashValues[i] = Integer.MAX_VALUE;
}
for (int i = 0; i < numHashFunctions; i++) {
for (Vector.Element ele : featureVector.nonZeroes()) {
int value = hashValue ? (int) ele.get() : ele.index();
bytesToHash[0] = (byte) (value >> 24);
bytesToHash[1] = (byte) (value >> 16);
bytesToHash[2] = (byte) (value >> 8);
bytesToHash[3] = (byte) value;
int hashIndex = hashFunction[i].hash(bytesToHash);
//if our new hash value is less than the old one, replace the old one
if (minHashValues[i] > hashIndex) {
minHashValues[i] = hashIndex;
}
}
}
// output the cluster information
for (int i = 0; i < numHashFunctions; i++) {
StringBuilder clusterIdBuilder = new StringBuilder();
for (int j = 0; j < keyGroups; j++) {
clusterIdBuilder.append(minHashValues[(i + j) % numHashFunctions]).append('-');
}
//remove the last dash
clusterIdBuilder.deleteCharAt(clusterIdBuilder.length() - 1);
cluster.set(clusterIdBuilder.toString());
if (debugOutput) {
vector.set(featureVector);
context.write(cluster, vector);
} else {
context.write(cluster, item);
}
}
protected void reduce(Text cluster, Iterable<Writable> points, Context context)
throws IOException, InterruptedException {
Collection<Writable> pointList = Lists.newArrayList();
for (Writable point : points) {
if (debugOutput) {
Vector pointVector = ((VectorWritable) point).get().clone();
Writable writablePointVector = new VectorWritable(pointVector);
pointList.add(writablePointVector);
} else {
Writable pointText = new Text(point.toString());
pointList.add(pointText);
}
}
if (pointList.size() >= minClusterSize) {
context.getCounter(Clusters.ACCEPTED).increment(1);
for (Writable point : pointList) {
context.write(cluster, point);
}
} else {
context.getCounter(Clusters.DISCARDED).increment(1);
}
}