job on hadoop

//http://distributed-agility.blogspot.com/2010/01/hadoop-0201-example-inverted-line-index.html

//https://portal.futuregrid.org/manual/hadoop-wordcount

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
* LineIndexer Creates an inverted index over all the words in a document corpus, mapping each observed word to a list
* of filename@offset locations where it occurs.
*/
public class LineIndexer extends Configured implements Tool {

// where to put the data in hdfs when we're done
private static final String OUTPUT_PATH = "output";

// where to read the data from.
private static final String INPUT_PATH = "input";

public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(), new LineIndexer(), args);
System.exit(res);
}

public int run(String[] args) throws Exception {

Configuration conf = getConf();
Job job = new Job(conf, "Line Indexer 1");

job.setJarByClass(LineIndexer.class);
job.setMapperClass(LineIndexMapper.class);
job.setReducerClass(LineIndexReducer.class);

job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);

FileInputFormat.addInputPath(job, new Path(INPUT_PATH));
FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH));

return job.waitForCompletion(true) ? 0 : 1;
}
}

After updating, make sure to generate a new jar, remove anything under the directory "output" (since the program does not clean that up), and execute the new version.

training@training-vm:~/git/exercises/shakespeare$ ant jar
Buildfile: build.xml

compile:
[javac] Compiling 4 source files to /home/training/git/exercises/shakespeare/bin

jar:
[jar] Building jar: /home/training/git/exercises/shakespeare/indexer.jar

BUILD SUCCESSFUL
Total time: 1 second

I have added two ASCII books to the input directory: the works of Leonardo da Vinci and the first volume of the book "The Outline of Science".

training@training-vm:~/git/exercises/shakespeare$ hadoop fs -ls input
Found 3 items
-rw-r--r-- 1 training supergroup 5342761 2009-12-30 11:57 /user/training/input/all-shakespeare
-rw-r--r-- 1 training supergroup 1427769 2010-01-04 17:42 /user/training/input/leornardo-davinci-all.txt
-rw-r--r-- 1 training supergroup 674762 2010-01-04 17:42 /user/training/input/the-outline-of-science-vol1.txt

The command used to run this example and its output are shown below.

training@training-vm:~/git/exercises/shakespeare$ hadoop jar indexer.jar index.LineIndexer
10/01/04 21:11:55 INFO input.FileInputFormat: Total input paths to process : 3
10/01/04 21:11:56 INFO mapred.JobClient: Running job: job_200912301017_0017
10/01/04 21:11:57 INFO mapred.JobClient: map 0% reduce 0%

猜你喜欢