[[PageOutline]]

◢ <[wiki:FDC110829/Lab5 Lab5]> | <[wiki:FDC110829 Back to Course Outline]> ▲ | ◣

= Lab6 =

{{{
#!html
<div style="text-align: center;"><big style="font-weight: bold;"><big>Verifying the Concepts of HDFS, MapReduce, and Hadoop Streaming<br/>Running Hadoop Streaming with HDFS and MapReduce</big></big></div>
}}}
                  
== Preparing the Input Dataset ==

 * First, prepare the input. It consists of two parts: (1) the *.fa files for Velvet to process; for ease of demonstration we use test_long.fa as a template and upload 99 copies of it under different file names, and (2) the input files for the Mapper, which contain the HDFS paths of those files.
{{{
~$ cp /usr/share/doc/velvet-example/examples/data/test_long.fa.gz .
~$ gunzip test_long.fa.gz
~$ for ((i=1;i<100;i++)); do hadoop fs -put test_long.fa sample-$i.fa; done
~$ for ((i=1;i<20;i++)); do echo /user/$(whoami)/sample-$i.fa; done > sample-01.txt
~$ for ((i=20;i<40;i++)); do echo /user/$(whoami)/sample-$i.fa; done > sample-02.txt
~$ for ((i=40;i<60;i++)); do echo /user/$(whoami)/sample-$i.fa; done > sample-03.txt
~$ for ((i=60;i<80;i++)); do echo /user/$(whoami)/sample-$i.fa; done > sample-04.txt
~$ for ((i=80;i<100;i++)); do echo /user/$(whoami)/sample-$i.fa; done > sample-05.txt
~$ hadoop fs -mkdir lab9_input
~$ hadoop fs -put sample-0* lab9_input
}}}
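 * As a quick sanity check (optional; a sketch assuming the default /user/$(whoami) HDFS home directory), count the uploaded samples and confirm the five local list files together cover sample-1 through sample-99:
{{{
~$ hadoop fs -ls | grep -c "sample-.*\.fa"      # expect 99 uploaded samples
~$ wc -l sample-0*.txt                          # the five lists should total 99 lines
}}}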
                  
 * Check the input files
{{{
~$ hadoop fs -ls
~$ hadoop fs -ls lab9_input
}}}
                  
== Observing Hadoop Streaming's Execution Identity and Working Directory ==

 * Write testmapper.sh. Note that this version only echoes the commands it would run (a dry run), so we can observe which user identity and working directory Hadoop Streaming uses:
{{{
#!sh
#!/bin/bash

id="h998"
mkdir -p /tmp/$id
host=`hostname`
pwd=`pwd`
uid=`whoami`

# Each line of the streaming input is the HDFS path of one *.fa file.
while read line; do
  input=$line
  filename=`basename $input`
  echo "$uid@$host:$pwd> hadoop fs -get $input /tmp/$id/$filename"
  echo "$uid@$host:$pwd> velveth output-$filename 17 -fasta -short /tmp/$id/$filename"
  echo "$uid@$host:$pwd> hadoop fs -put output-$filename ."
done
rm -rf /tmp/$id
}}}
                  
 * Next, let's verify locally that testmapper.sh behaves as expected
{{{
~$ head -n 2 sample-01.txt > sample-00.txt
~$ cat > testmapper.sh << EOF
#!/bin/bash

id="`whoami`"
mkdir -p /tmp/\$id
host=\`hostname\`
pwd=\`pwd\`
uid=\`whoami\`

while read line; do
  input=\$line
  filename=\`basename \$input\`
  echo "\$uid@\$host:\$pwd> hadoop fs -get \$input /tmp/\$id/\$filename"
  echo "\$uid@\$host:\$pwd> velveth output-\$filename 17 -fasta -short /tmp/\$id/\$filename"
  echo "\$uid@\$host:\$pwd> hadoop fs -put output-\$filename ."
done
rm -rf /tmp/\$id
EOF
~$ chmod a+x testmapper.sh
~$ cat sample-00.txt | ./testmapper.sh
h998@hadoop:/home/h998> hadoop fs -get /user/h998/sample-1.fa /tmp/h998/sample-1.fa
h998@hadoop:/home/h998> velveth output-sample-1.fa 17 -fasta -short /tmp/h998/sample-1.fa
h998@hadoop:/home/h998> hadoop fs -put output-sample-1.fa .
h998@hadoop:/home/h998> hadoop fs -get /user/h998/sample-2.fa /tmp/h998/sample-2.fa
}}}
                  
 * Now let's run testmapper.sh through Hadoop Streaming
{{{
~$ hadoop jar hadoop-streaming.jar -input lab9_input -output lab9_out1 -mapper testmapper.sh -file testmapper.sh
}}}
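 * While the job is running, its progress can also be followed from the shell (an optional convenience; the command below is the Hadoop 0.20/1.x syntax this lab appears to use, and the JobTracker web UI shows the same information):
{{{
~$ hadoop job -list            # list running jobs and their job IDs
}}}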
                  
 * Examine the results in lab9_out1. How do they differ from the local run?
{{{
~$ hadoop fs -cat /user/$(whoami)/lab9_out1/part-00000 | head
hadoop@hadoop104:/var/lib/hadoop/cache/hadoop/mapred/local/taskTracker/jobcache/job_201106041247_1820/attempt_201106041247_1820_m_000002_0/work> hadoop fs -get /user/h998/sample-60.fa /tmp/h998/sample-60.fa
hadoop@hadoop104:/var/lib/hadoop/cache/hadoop/mapred/local/taskTracker/jobcache/job_201106041247_1820/attempt_201106041247_1820_m_000002_0/work> hadoop fs -get /user/h998/sample-61.fa /tmp/h998/sample-61.fa
hadoop@hadoop104:/var/lib/hadoop/cache/hadoop/mapred/local/taskTracker/jobcache/job_201106041247_1820/attempt_201106041247_1820_m_000002_0/work> hadoop fs -get /user/h998/sample-62.fa /tmp/h998/sample-62.fa
}}}
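 * The key difference: on the cluster each mapper runs as the {{{hadoop}}} user inside a per-attempt jobcache working directory on some TaskTracker node, not as your own account in your home directory. To list every identity and working directory that took part (a small sketch; it simply strips everything after the "> " prompt marker the script prints):
{{{
~$ hadoop fs -cat /user/$(whoami)/lab9_out1/part-* | cut -d'>' -f1 | sort -u
}}}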
                  
== Running 99 velvet Computations via Hadoop Streaming ==

 * Write mapper.sh. Unlike testmapper.sh, this version actually executes each command; the final {{{hadoop fs -chown}}} hands ownership of the results back to your account, since on the cluster they are created by the {{{hadoop}}} user:
{{{
#!sh
#!/bin/bash

id="h998"
mkdir -p /tmp/$id
host=`hostname`
pwd=`pwd`
uid=`whoami`

while read line; do
  input=$line
  filename=`basename $input`
  # Fetch the input from HDFS, run velveth on it, then push the result back.
  echo "$uid@$host> hadoop fs -get $input /tmp/$id/$filename"
  hadoop fs -get $input /tmp/$id/$filename
  echo "$uid@$host> velveth output-$filename 17 -fasta -short /tmp/$id/$filename"
  velveth output-$filename 17 -fasta -short /tmp/$id/$filename
  echo "$uid@$host> hadoop fs -put output-$filename /user/$id/."
  hadoop fs -put output-$filename /user/$id/.
  hadoop fs -chown $id /user/$id/output-$filename
done
rm -rf /tmp/$id
}}}
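 * One caveat in this design: if several map tasks land on the same node, they all share /tmp/h998, and each task's final rm -rf can delete files a concurrent task is still using. A more defensive variant (a sketch only; it assumes mktemp is available on every worker node) gives each task a private scratch directory:
{{{
#!sh
#!/bin/bash
# Sketch: per-task scratch directory instead of the shared /tmp/$id.
tmpdir=`mktemp -d /tmp/velvet.XXXXXX`

while read line; do
  filename=`basename $line`
  hadoop fs -get $line $tmpdir/$filename
  velveth output-$filename 17 -fasta -short $tmpdir/$filename
  hadoop fs -put output-$filename /user/h998/.
  hadoop fs -chown h998 /user/h998/output-$filename
done

# Only this task's own scratch space is removed.
rm -rf $tmpdir
}}}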
                  
 * Test mapper.sh locally (the last two commands remove the trial outputs so they will not collide with the cluster run)
{{{
~$ cat > mapper.sh << EOF
#!/bin/bash

id="`whoami`"
mkdir -p /tmp/\$id
host=\`hostname\`
pwd=\`pwd\`
uid=\`whoami\`

while read line; do
  input=\$line
  filename=\`basename \$input\`
  echo "\$uid@\$host> hadoop fs -get \$input /tmp/\$id/\$filename"
  hadoop fs -get \$input /tmp/\$id/\$filename
  echo "\$uid@\$host> velveth output-\$filename 17 -fasta -short /tmp/\$id/\$filename"
  velveth output-\$filename 17 -fasta -short /tmp/\$id/\$filename
  echo "\$uid@\$host> hadoop fs -put output-\$filename /user/\$id/."
  hadoop fs -put output-\$filename /user/\$id/.
  hadoop fs -chown \$id /user/\$id/output-\$filename
done
rm -rf /tmp/\$id
EOF
~$ chmod a+x mapper.sh
~$ cat sample-00.txt | ./mapper.sh
~$ hadoop fs -rmr output-*
~$ rm -rf output-sample-*
}}}
                  
 * Then run it with Hadoop Streaming
{{{
~$ hadoop jar hadoop-streaming.jar -input lab9_input -output lab9_out2 -mapper mapper.sh -file mapper.sh
}}}
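 * After the job completes, the streaming log output lives in lab9_out2, and each assembly should land in an output-sample-*.fa directory under your HDFS home (a quick check; the exact listing format varies across Hadoop versions):
{{{
~$ hadoop fs -ls lab9_out2
~$ hadoop fs -ls | grep -c "output-sample-"     # expect 99 output directories
}}}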
                  