hadoop自定义数据类型

haloi 发表于 2017-12-17 06:53:48

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

　　
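// NOTE(review): the original class header was lost when the post was extracted ("public>").
// "TestPhone" is a placeholder taken from the last segment of OUTPUT_PATH - confirm the real name.
public class TestPhone {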

　　
/** Path of the input data file; passed to {@code FileInputFormat.setInputPaths} in {@code main}. */
static final String INPUT_PATH = "F:/Tutorial/Hadoop/TestData/data/HTTP_20130313143750.dat";

/**
 * HDFS URI of the job output directory. {@code main} deletes this directory if it already
 * exists before the job is started.
 */
static final String OUTPUT_PATH = "hdfs://masters:9000/user/hadoop/output/TestPhone";

　　
public static void main(String[] args) throws IOException,>　　

　　
      //添加以下的代码，就可以联通，不知道咋回事
　　
      String path = new File(".").getCanonicalPath();
　　
      System.getProperties().put("hadoop.home.dir", path);
　　
      new File("./bin").mkdirs();
　　
      new File("./bin/winutils.exe").createNewFile();
　　

　　
      Configuration conf = new Configuration();
　　
      Path outpath = new Path(OUTPUT_PATH);
　　

　　
      //检测输出路径是否存在，如果存在就删除，否则会报错
　　
      FileSystem fileSystem = FileSystem.get(new URI(OUTPUT_PATH), conf);
　　
      if(fileSystem.exists(outpath)){
　　
         fileSystem.delete(outpath, true);
　　
      }
　　

　　
      Job job = new Job(conf, "SimLines");
　　

　　
      FileInputFormat.setInputPaths(job, INPUT_PATH);
　　
      FileOutputFormat.setOutputPath(job, outpath);
　　

　　
      job.setMapperClass(MyMapper.class);
　　
      job.setReducerClass(MyReducer.class);
　　
      job.setOutputKeyClass(Text.class);
　　
      job.setOutputValueClass(SimLines.class);
　　
      job.waitForCompletion(true);
　　
}
　　

　　
//输入，map，即拆分过程

　　
static>　　

　　
      protected void map(LongWritable k1, Text v1, Context context)throws IOException, InterruptedException{
　　
         String[] splits = v1.toString().split("\t");//按照空格拆分
　　
         Text k2 = new Text(splits);
　　
         SimLines simLines = new SimLines(splits, splits);
　　
         context.write(k2, simLines);
　　
      }
　　
}
　　

　　
//输出，reduce，汇总过程

　　
static>　　
      protected void reduce(
　　
            Text k2, //输出的内容，即value
　　
            Iterable<SimLines> v2s, //是一个longwritable类型的数组，所以用了Iterable这个迭代器，且元素为v2s
　　
            org.apache.hadoop.mapreduce.Reducer<Text, SimLines, Text, SimLines>.Context context)
　　
            //这里一定设置好，不然输出会变成单个单词，从而没有统计数量
　　
            throws IOException, InterruptedException {
　　
         //列表求和初始为0
　　
         long upPackNum = 0L, downPackNum = 0L;
　　
         for(SimLines simLines:v2s){
　　
            upPackNum += simLines.upPackNum;
　　
            downPackNum += simLines.downPackNum;
　　
         }
　　
         SimLines v3 = new SimLines(upPackNum + "", downPackNum + "");
　　
         context.write(k2, v3);
　　
      }
　　
}
　　
}

页: [1]

运维网's Archiver

hadoop自定义数据类型