Hadoop MapReduce 开发实践：文件合并（join）
#!/bin/bash
#
# Hadoop Streaming two-dataset join.
#   step1 / step2: tag the records of a.txt / b.txt via mapperA.py / mapperB.py.
#   step3:         join the two tagged outputs via mapperjoin.py + reducerjoin.py.
# Requires mapperA.py, mapperB.py, mapperjoin.py, reducerjoin.py in the
# current working directory, and a reachable HDFS/YARN cluster.

set -u

readonly HADOOP_CMD="/home/hadoop/app/hadoop/hadoop-2.6.0-cdh5.13.0/bin/hadoop"
readonly STREAM_JAR_PATH="/home/hadoop/app/hadoop/hadoop-2.6.0-cdh5.13.0/share/hadoop/tools/lib/hadoop-streaming-2.6.0-cdh5.13.0.jar"

readonly INPUT_FILE_PATH_A="/input/join/a.txt"
readonly INPUT_FILE_PATH_B="/input/join/b.txt"
readonly OUTPUT_FILE_PATH_A="/output/join/a"
readonly OUTPUT_FILE_PATH_B="/output/join/b"
readonly OUTPUT_FILE_JOIN_PATH="/output/join/abjoin"

# Remove previous outputs so the jobs can re-run; '|| true' keeps the script
# going on the first run, when the paths do not exist yet.
"$HADOOP_CMD" fs -rm -r -skipTrash "$OUTPUT_FILE_PATH_A" || true
"$HADOOP_CMD" fs -rm -r -skipTrash "$OUTPUT_FILE_PATH_B" || true
"$HADOOP_CMD" fs -rm -r -skipTrash "$OUTPUT_FILE_JOIN_PATH" || true

# step1: map a — tag every record of dataset A.
"$HADOOP_CMD" jar "$STREAM_JAR_PATH" \
  -input "$INPUT_FILE_PATH_A" \
  -output "$OUTPUT_FILE_PATH_A" \
  -jobconf "mapred.job.name=joinfinemapA" \
  -mapper "python mapperA.py" \
  -file "./mapperA.py" || exit 1

# step2: map b — tag every record of dataset B.
"$HADOOP_CMD" jar "$STREAM_JAR_PATH" \
  -input "$INPUT_FILE_PATH_B" \
  -output "$OUTPUT_FILE_PATH_B" \
  -jobconf "mapred.job.name=joinfinemapB" \
  -mapper "python mapperB.py" \
  -file "./mapperB.py" || exit 1

# step3: join — sort on the first 2 fields of the map output, but partition on
# the first field only, so all records sharing a join key reach the same
# reducer. num.key.fields.for.partition only takes effect together with
# KeyFieldBasedPartitioner, hence the explicit -partitioner.
"$HADOOP_CMD" jar "$STREAM_JAR_PATH" \
  -input "$OUTPUT_FILE_PATH_A,$OUTPUT_FILE_PATH_B" \
  -output "$OUTPUT_FILE_JOIN_PATH" \
  -mapper "python mapperjoin.py" \
  -reducer "python reducerjoin.py" \
  -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
  -jobconf "mapred.job.name=joinfinemapAB" \
  -jobconf "stream.num.map.output.key.fields=2" \
  -jobconf "num.key.fields.for.partition=1" \
  -file "./reducerjoin.py" \
  -file "./mapperjoin.py" || exit 1
页: [1]