Changes between Version 12 and Version 13 of Streaming


Ignore:
Timestamp:
Sep 13, 2009, 8:07:57 PM (15 years ago)
Author:
waue
Comment:

--

Legend:

Unmodified
Added
Removed
Modified
  • Streaming

    v12 v13  
    4747 = 用php實做mapReduce =
    4848 * [http://www.hadoop.tw/2008/09/php-hadoop.html 用 "單機" 跟 "PHP" 開發 Hadoop 程式] from Hadoop Taiwan User Group
     49{{{
     50$ cd /opt/hadoop/
     51$ sudo apt-get install php5-cli
     52}}}
    4953
     54{{{
     55$ gedit mapper.php
     56}}}
     57
     58{{{
     59#!php
     60#!/usr/bin/php
     61<?php
     62
     63$word2count = array();
     64
     65// 標準輸入為 STDIN (standard input)
     66while (($line = fgets(STDIN)) !== false) {
     67   // 轉成小寫並移除前後空白
     68   $line = strtolower(trim($line));
     69   // 將該行拆解成各個單字,存入 $words 陣列中
     70   $words = preg_split('/\W/', $line, 0, PREG_SPLIT_NO_EMPTY);
     71   // 將該單字的出現次數 +1
     72   foreach ($words as $word) {
     73       $word2count[$word] += 1;
     74   }
     75}
     76
     77// 將結果寫到 STDOUT (standard output)
     78foreach ($word2count as $word => $count) {
     79   // 印出 [字 , "tab符號" ,  "數字" , "換行字元"]
     80   echo $word, chr(9), $count, PHP_EOL;
     81}
     82?>
     83}}}
     84
     85{{{
     86$ gedit reducer.php
     87}}}
     88
     89{{{
     90#!php
     91#!/usr/bin/php
     92<?php
     93
     94$word2count = array();
     95
     96// 輸入為 STDIN
     97while (($line = fgets(STDIN)) !== false) {
     98    // 移除多餘的空白
     99    $line = trim($line);
     100    // 每一行的格式為 (單字 "tab" 數字) ,紀錄到($word, $count)
     101    list($word, $count) = explode(chr(9), $line);
     102    // 轉換格式string -> int
     103    $count = intval($count);
     104    // 加總
     105    if ($count > 0) $word2count[$word] += $count;
     106}
     107
     108// 此行非必要,但可讓輸出依單字 (key) 排序
     109ksort($word2count);
     110
     111// 將結果寫到 STDOUT (standard output)
     112foreach ($word2count as $word => $count) {
     113    echo $word, chr(9), $count, PHP_EOL;
     114}
     115
     116?>
     117}}}
     118
     119{{{
     120$ chmod 755 *.php
     121$ echo "i love hadoop, hadoop love u" | ./mapper.php | ./reducer.php
     122$ bin/hadoop jar hadoop/contrib/hadoop-*-streaming.jar -mapper mapper.php -reducer reducer.php -input lab4_input -output stream_out2
     123}}}
     124
     125 * 檢查結果
     126
     127{{{
     128$ hadoop dfs -cat stream_out2/part-00000 | more
     129}}}
    50130
    51131 = Python 實做 =