Changes between Initial Version and Version 1 of FDC110829/Lab5


Ignore:
Timestamp:
Aug 28, 2011, 11:03:47 PM (13 years ago)
Author:
jazz
Comment:

--

Legend:

Unmodified
Added
Removed
Modified
  • FDC110829/Lab5

    v1 v1  
     1[[PageOutline]]
     2
     3◢ <[wiki:FDC110829/Lab4 實作四]> | <[wiki:FDC110829 回課程大綱]> ▲ | <> ◣
     4
     5= 實作五 Lab 5 =
     6
     7{{{
     8#!html
     9<div style="text-align: center;"><big style="font-weight: bold;"><big><br/>Hadoop Streaming in Different Languages</big></big></div>
     10}}}
     11
     12== Existing Binary ==
     13
     14{{{
     15~$ hadoop fs -put /etc/hadoop/conf lab7_input
     16~$ hadoop jar hadoop-streaming.jar -input lab7_input -output lab7_out1 -mapper /bin/cat -reducer /usr/bin/wc
     17~$ hadoop fs -cat lab7_out1/part-00000
     18}}}
     19
     20== Bash Shell Script ==
     21
     22{{{
     23~$ echo "sed -e \"s/ /\n/g\" | grep ." > streamingMapper.sh
     24~$ echo "uniq -c | awk '{print \$2 \"\t\" \$1}'" > streamingReducer.sh
     25~$ chmod a+x streamingMapper.sh
     26~$ chmod a+x streamingReducer.sh
     27~$ hadoop jar hadoop-streaming.jar -input lab7_input -output lab7_out2 -mapper streamingMapper.sh -reducer streamingReducer.sh -file streamingMapper.sh -file streamingReducer.sh
     28~$ hadoop fs -cat lab7_out2/part-00000
     29}}}
     30
     31== PHP Script ==
     32
     33 * 編輯 mapper 的 php 程式
     34{{{
     35~$ cat > mapper.php << EOF
     36#!/usr/bin/php
     37<?php
     38
     39\$word2count = array();
     40
     41// 標準輸入為 STDIN (standard input)
     42while ((\$line = fgets(STDIN)) !== false) {
     43   // 轉成小寫並移除前後空白
     44   \$line = strtolower(trim(\$line));
     45   // 將行拆解成各個字於 words 陣列中
     46   \$words = preg_split('/\W/', \$line, 0, PREG_SPLIT_NO_EMPTY);
     47   // 將字 +1
     48   foreach (\$words as \$word) {
     49       \$word2count[\$word] += 1;
     50   }
     51}
     52
     53// 將結果寫到 STDOUT (standard output)
     54foreach (\$word2count as \$word => \$count) {
     55   // 印出 [字 , "tab符號" ,  "數字" , "結束字元"]
     56   echo \$word, chr(9), \$count, PHP_EOL;
     57}
     58?>
     59EOF
     60}}}
     61 * 編輯 reducer 的 php 程式
     62{{{
     63~$ cat > reducer.php << EOF
     64#!/usr/bin/php
     65<?php
     66
     67\$word2count = array();
     68
     69// 輸入為 STDIN
     70while ((\$line = fgets(STDIN)) !== false) {
     71    // 移除多餘的空白
     72    \$line = trim(\$line);
     73    // 每一行的格式為 (單字 "tab" 數字) ,紀錄到(\$word, \$count)
     74    list(\$word, \$count) = explode(chr(9), \$line);
     75    // 轉換格式string -> int
     76    \$count = intval(\$count);
     77    // 加總
     78    if (\$count > 0) \$word2count[\$word] += \$count;
     79}
     80
     81// 此行不必要,但可讓output排列更完整
     82ksort(\$word2count);
     83
     84// 將結果寫到 STDOUT (standard output)
     85foreach (\$word2count as \$word => \$count) {
     86    echo \$word, chr(9), \$count, PHP_EOL;
     87}
     88?>
     89EOF
     90}}}
     91 * 修改執行權限
     92{{{
     93~$ chmod a+x *.php
     94}}}
     95 * 測試是否能運作
     96{{{
     97~$ echo "i love hadoop, hadoop love u" | ./mapper.php | ./reducer.php
     98}}}
     99 * 開始執行
     100{{{
     101~$ hadoop jar hadoop-streaming.jar -mapper mapper.php -reducer reducer.php -input lab7_input -output lab7_out3 -file mapper.php -file reducer.php
     102}}}
     103 * 檢查結果
     104{{{
     105~$ hadoop fs -cat lab7_out3/part-00000
     106}}}