色粉盒撒娇 发表于 2018-10-28 14:03:20

hadoop mapreduce开发实践文件合并(join)

#!/bin/bash
#
# Hadoop Streaming two-file join:
#   step1: map a.txt  -> /output/join/a    (tag records from side A)
#   step2: map b.txt  -> /output/join/b    (tag records from side B)
#   step3: join both map outputs on the first key field -> /output/join/abjoin
#
# Requires: mapperA.py, mapperB.py, mapperjoin.py, reducerjoin.py in the
# current directory; a reachable HDFS with the /input/join files present.
set -euo pipefail

# NOTE(fix): in the original, this assignment was glued onto the shebang
# line and therefore lived inside the `#!` comment — HADOOP_CMD was never
# set and every `$HADOOP_CMD fs ...` call degenerated to the bare word `fs`.
readonly HADOOP_CMD="/home/hadoop/app/hadoop/hadoop-2.6.0-cdh5.13.0/bin/hadoop"
readonly STREAM_JAR_PATH="/home/hadoop/app/hadoop/hadoop-2.6.0-cdh5.13.0/share/hadoop/tools/lib/hadoop-streaming-2.6.0-cdh5.13.0.jar"

readonly INPUT_FILE_PATH_A="/input/join/a.txt"
readonly INPUT_FILE_PATH_B="/input/join/b.txt"
readonly OUTPUT_FILE_PATH_A="/output/join/a"
readonly OUTPUT_FILE_PATH_B="/output/join/b"
readonly OUTPUT_FILE_JOIN_PATH="/output/join/abjoin"

# Remove stale outputs from previous runs. `fs -rmr` is deprecated in favor
# of `fs -rm -r`; tolerate "path does not exist" on the first run (|| true).
"$HADOOP_CMD" fs -rm -r -skipTrash "$OUTPUT_FILE_PATH_A" || true
"$HADOOP_CMD" fs -rm -r -skipTrash "$OUTPUT_FILE_PATH_B" || true
"$HADOOP_CMD" fs -rm -r -skipTrash "$OUTPUT_FILE_JOIN_PATH" || true

# step1: map-only pass over side A (no reducer specified).
"$HADOOP_CMD" jar "$STREAM_JAR_PATH" \
  -input "$INPUT_FILE_PATH_A" \
  -output "$OUTPUT_FILE_PATH_A" \
  -jobconf "mapred.job.name=joinfinemapA" \
  -mapper "python mapperA.py" \
  -file "./mapperA.py"

# step2: map-only pass over side B.
"$HADOOP_CMD" jar "$STREAM_JAR_PATH" \
  -input "$INPUT_FILE_PATH_B" \
  -output "$OUTPUT_FILE_PATH_B" \
  -jobconf "mapred.job.name=joinfinemapB" \
  -mapper "python mapperB.py" \
  -file "./mapperB.py"

# step3: join both map outputs. Sort on 2 key fields but partition on the
# first only, so all records sharing a join key land in the same reducer.
"$HADOOP_CMD" jar "$STREAM_JAR_PATH" \
  -input "$OUTPUT_FILE_PATH_A,$OUTPUT_FILE_PATH_B" \
  -output "$OUTPUT_FILE_JOIN_PATH" \
  -mapper "python mapperjoin.py" \
  -reducer "python reducerjoin.py" \
  -jobconf "mapred.job.name=joinfinemapAB" \
  -jobconf "stream.num.map.output.key.fields=2" \
  -jobconf "num.key.fields.for.partition=1" \
  -file "./reducerjoin.py" \
  -file "./mapperjoin.py"

页: [1]
查看完整版本: hadoop mapreduce开发实践文件合并(join)