Hadoop WordCount 小程序

mofdan 发表于 2017-12-17 07:36:35

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
　　

　　
public>　　

　　
static final String INPUT_PATH = "hdfs://masters:9000/user/hadoop/input";
　　
static final String OUTPUT_PATH = "hdfs://masters:9000/user/hadoop/output";

　　
public static void main(String[] args) throws IOException,>　　

　　
      //添加以下的代码，就可以联通，不知道咋回事
　　
      String path = new File(".").getCanonicalPath();
　　
      System.getProperties().put("hadoop.home.dir", path);
　　
      new File("./bin").mkdirs();
　　
      new File("./bin/winutils.exe").createNewFile();
　　

　　
      Configuration conf = new Configuration();
　　
      Path outpath = new Path(OUTPUT_PATH);
　　

　　
      Job job = new Job(conf, "WorldCount");
　　

　　
      FileInputFormat.setInputPaths(job, INPUT_PATH);
　　
      FileOutputFormat.setOutputPath(job, outpath);
　　

　　
      //检测输出路径是否存在，如果存在就删除，否则会报错
　　
      FileSystem fileSystem = FileSystem.get(new URI(OUTPUT_PATH), conf);
　　
      if(fileSystem.exists(outpath)){
　　
         fileSystem.delete(outpath, true);
　　
      }
　　

　　
      job.setMapperClass(MyMapper.class);
　　
      job.setReducerClass(MyReducer.class);
　　
      job.setOutputKeyClass(Text.class);
　　
      job.setOutputValueClass(LongWritable.class);
　　
      job.waitForCompletion(true);
　　
}
　　

　　
//输入，map，即拆分过程

　　
static>　　

　　
      /*
　　
      * 输入为（key,value）输出为（value,count数量）
　　
      * 所以LongWritable, Text, Text, LongWritable分别代表 key(行号) value value count
　　
      * 其中LongWritable和Text是hadoop定义的类型，分别代表long和string两种类型
　　
      * */
　　
      protected void map(LongWritable k1, Text v1, Context context)throws IOException, InterruptedException{
　　
         String[] splits = v1.toString().split(" ");//按照空格拆分
　　
         for(String str: splits){
　　
            System.out.println("---" + str);
　　
            context.write(new Text(str), new LongWritable(1));//拆分出来的形式为（“单词”，出现次数（这里默认为1））
　　
         }
　　
      }
　　
}
　　

　　
//输出，reduce，汇总过程

　　
static>　　
      protected void reduce(
　　
            Text k2, //输出的内容，即value
　　
            Iterable<LongWritable> v2s, //是一个longwritable类型的数组，所以用了Iterable这个迭代器，且元素为v2s
　　
            org.apache.hadoop.mapreduce.Reducer<Text, LongWritable, Text, LongWritable>.Context context)
　　
            //这里一定设置好，不然输出会变成单个单词，从而没有统计数量
　　
            throws IOException, InterruptedException {
　　
         //列表求和初始为0
　　
         long times = 0L;
　　
         for(LongWritable count:v2s){
　　
            times += count.get();
　　
         }
　　
         context.write(k2, new LongWritable(times));
　　
      }
　　
}
　　
}

页: [1]

运维网's Archiver

Hadoop WordCount 小程序