注意点:
1、输入数据是一个目录(作业会读取目录下的所有文档)
2、在 map 中需要通过 InputSplit 获取当前 key 所属文档的路径或名称
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import java.io.IOException;
import java.util.StringTokenizer;
public class InvertedIndexJob {
-
-
public static class InvertedIndexMapper extends Mapper<LongWritable, Text, Text,IntWritable>{
-
-
@Override
-
public void map(LongWritable key, Text text, Context context) throws IOException, InterruptedException {
-
-
String data= text.toString();
-
FileSplit inputSplit = (FileSplit) context.getInputSplit();
-
String fileName = inputSplit.getPath().getName();
-
StringTokenizer strToken = new StringTokenizer(data);
-
while (strToken.hasMoreTokens()){
-
context.write(new Text(strToken.nextToken()+":"+fileName), new IntWritable(1));
-
}
-
}
-
}
-
-
public static class InvertedIndexReduce extends Reducer<Text,IntWritable, NullWritable, Text>{
-
@Override
-
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
-
int count=0;
-
for (IntWritable value: values){
-
count++;
-
}
-
context.write(NullWritable.get(), new Text( key.toString() +"\t"+ String.valueOf(count)));
-
}
-
}
-
-
public static void main(String []args){
-
-
Job job = null;
-
try {
-
job = Job.getInstance();
-
job.setJobName("InvertedIndexJob");
-
job.setJarByClass(InvertedIndexJob.class);
-
job.setMapperClass(InvertedIndexMapper.class);
-
job.setMapOutputKeyClass(Text.class);
-
job.setMapOutputValueClass(IntWritable.class);
-
job.setReducerClass(InvertedIndexReduce.class);
-
job.setOutputKeyClass(NullWritable.class);
-
job.setOutputValueClass(Text.class);
-
job.setNumReduceTasks(1);
-
-
String []arrays = new GenericOptionsParser(args).getRemainingArgs();
-
FileInputFormat.setInputPaths(job, new Path(arrays[0]));
-
FileOutputFormat.setOutputPath(job,new Path(arrays[1]));
-
-
System.out.println(job.waitForCompletion(true));
-
-
} catch (IOException e) {
-
e.printStackTrace();
-
} catch (InterruptedException e) {
-
e.printStackTrace();
-
} catch (ClassNotFoundException e) {
-
e.printStackTrace();
-
}
-
-
}
- }