
= hadoop streaming + perl =

Seen in passing; not yet fully tested myself.

Note:
Streaming currently does not support Linux pipes (i.e. pipelines such as `cat | wc -l`), but that does not stop us from using Perl/Python one-line commands!!
The original wording is:

Can I use UNIX pipes? For example, will -mapper "cut -f1 | sed s/foo/bar/g" work?
Currently this does not work and gives an "java.io.IOException: Broken pipe" error.
This is probably a bug that needs to be investigated.

But if you are a die-hard Linux shell pipe fan, see below:

{{{
$> perl -e 'open( my $fh, "grep -v null tt | sed -n 1,5p |" ); while ( <$fh> ) { print; }'
# However, I did not get this to pass testing!!
}}}
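
The trick is to bury the whole pipeline inside a single perl process, so that streaming only ever sees one command. Presumably (equally untested, and using the same `\$`/`\"` escaping rule as the run script below) it would be passed as a mapper roughly like this; the output path is hypothetical:

{{{
# the inner grep reads the mapper's stdin, which the child
# pipeline inherits from perl
hadoop jar hadoop-0.18.3-streaming.jar \
    -input file:///data/hadoop/lky/jar/tt \
    -mapper "perl -e 'open( my \$fh, \"grep -v null | sed -n 1,5p |\" ); while ( <\$fh> ) { print; }'" \
    -reducer "/bin/cat" \
    -output file:///tmp/lky/streaming-pipe
}}}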
Environment: hadoop-0.18.3

{{{
$> find . -type f -name "*streaming*.jar"
./contrib/streaming/hadoop-0.18.3-streaming.jar
}}}

Test data:

{{{
$ head tt
null false 3702 208100
6005100 false 70 13220
6005127 false 24 4640
6005160 false 25 4820
6005161 false 20 3620
6005164 false 14 1280
6005165 false 37 7080
6005168 false 104 20140
6005169 false 35 6680
6005240 false 169 32140
......
}}}
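
For a quick local cross-check of what the job should produce, the same total can be computed with awk over the last tab-separated field (assuming, as the head output suggests, that every line of tt is tab-delimited):

{{{
# should print 1166480, matching the streaming job below
$ awk -F'\t' '{ sum += $NF } END { print sum }' tt
}}}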

Run:
{{{
#!sh

c1=" perl -ne 'if(/.*\t(.*)/){\$sum+=\$1;}END{print \"\$sum\";}' "
# Note: inside the double quotes, $ must be written as \$ and " as \"

echo $c1;   # prints: perl -ne 'if(/.*\t(.*)/){$sum+=$1;}END{print "$sum";}'
hadoop jar hadoop-0.18.3-streaming.jar \
    -input file:///data/hadoop/lky/jar/tt \
    -mapper "/bin/cat" \
    -reducer "$c1" \
    -output file:///tmp/lky/streamingx8
}}}
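
One subtlety in the reducer: because `.*` is greedy, `/.*\t(.*)/` places the capture after the last tab, so only the fourth column is summed. A quick demonstration on the first data line:

{{{
$ printf 'null\tfalse\t3702\t208100\n' | perl -ne 'print "$1\n" if /.*\t(.*)/'
208100
}}}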

Result:
{{{
$ cat /tmp/lky/streamingx8/*
1166480
}}}

Local run output:

{{{
$ perl -ne 'if(/.*\t(.*)/){$sum+=$1;}END{print $sum;}' < tt
1166480
}}}
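
All of the `\$`/`\"` escaping can be avoided by shipping the one-liner as a small script via the -file option (listed in the built-in documentation below). A sketch along those lines; the file name sum.pl and the output path are my own and untested here:

{{{
$ cat sum.pl
#!/usr/bin/perl
# sum the last tab-separated field of every stdin line
while (<STDIN>) { $sum += $1 if /.*\t(.*)/; }
print "$sum";

$ hadoop jar hadoop-0.18.3-streaming.jar \
      -input file:///data/hadoop/lky/jar/tt \
      -mapper "/bin/cat" \
      -reducer "perl sum.pl" \
      -file sum.pl \
      -output file:///tmp/lky/streamingx9
}}}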

The result is correct!!!!


Built-in documentation for the command:
{{{
-bash-3.00$ hadoop jar hadoop-0.18.3-streaming.jar -info
09/09/25 14:50:12 ERROR streaming.StreamJob: Missing required option -input
Usage: $HADOOP_HOME/bin/hadoop [--config dir] jar \
          $HADOOP_HOME/hadoop-streaming.jar [options]
Options:
  -input    <path>                   DFS input file(s) for the Map step
  -output   <path>                   DFS output directory for the Reduce step
  -mapper   <cmd|JavaClassName>      The streaming command to run
  -combiner <JavaClassName>          Combiner has to be a Java class
  -reducer  <cmd|JavaClassName>      The streaming command to run
  -file     <file>                   File/dir to be shipped in the Job jar file
  -dfs      <h:p>|local              Optional. Override DFS configuration
  -jt       <h:p>|local              Optional. Override JobTracker configuration
  -additionalconfspec specfile       Optional.
  -inputformat TextInputFormat(default)|SequenceFileAsTextInputFormat|JavaClassName Optional.
  -outputformat TextOutputFormat(default)|JavaClassName Optional.
  -partitioner JavaClassName         Optional.
  -numReduceTasks <num>              Optional.
  -inputreader <spec>                Optional.
  -jobconf  <n>=<v>                  Optional. Add or override a JobConf property
  -cmdenv   <n>=<v>                  Optional. Pass env.var to streaming commands
  -mapdebug <path>                   Optional. To run this script when a map task fails
  -reducedebug <path>                Optional. To run this script when a reduce task fails
  -cacheFile fileNameURI
  -cacheArchive fileNameURI
  -verbose
}}}
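
As one example of the Optional flags: since the reducer keeps a single running total, forcing one reduce task guarantees one global sum, and -jobconf can set the job name (mapred.job.name is the standard property in 0.18). An untested sketch, output path mine:

{{{
hadoop jar hadoop-0.18.3-streaming.jar \
    -input file:///data/hadoop/lky/jar/tt \
    -mapper "/bin/cat" \
    -reducer "$c1" \
    -numReduceTasks 1 \
    -jobconf mapred.job.name=sum-tt \
    -output file:///tmp/lky/streamingx10
}}}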

References: