|
Last change on this file since 247 was 247, checked in by shunfa, 15 years ago
|
|
將go.sh複製到nutchez/system/目錄下
|
- Property svn:executable set to *
|
|
File size:
1.1 KB
|
| Line | |
|---|
#!/bin/bash
#
# go.sh - run a Nutch crawl to the requested depth.
# Usage: ./go.sh <depth>

# Crawl depth, taken from the first command-line argument; it is later passed
# to `nutch crawl -depth`.
crawl_dep=$1

# No depth given: print the usage notes (in Traditional Chinese) and stop.
# The notes say: (1) the nutchuser account must exist and hadoop must already
# be running, (2) the URLs to crawl are listed in the urls.txt file named in
# the message, (3) run ./go.sh [depth], for example ./go.sh 3.
# NOTE(review): note 2 names .../nutchez/url/urls.txt, while the rest of the
# script uploads /home/nutchuser/nutchez/urls -- confirm which path is right.
if [[ -z "$crawl_dep" ]]; then
  echo "1. 使用這個shell ,首先你需要有nutchuser這個使用者,並且hadoop 已經開始運作";
  echo "2. /home/nutchuser/nutchez/url/urls.txt 裡面有你要抓的網址";
  echo "3. 執行 ./go.sh [深度] 即可,如:";
  echo " ./go.sh 3"
  exit
fi

# Reject anything that is not a positive integer before going further: the
# steps that follow delete the previous search data, so a mistyped depth must
# never get that far.
if [[ ! "$crawl_dep" =~ ^0*[1-9][0-9]*$ ]]; then
  echo "invalid depth '$crawl_dep': expected a positive integer, e.g. ./go.sh 3" >&2
  exit 1
fi

#######################################
# Report the outcome of the command that ran immediately before this call.
# Must be invoked directly after the command being checked, because the
# result is read from that command's exit status ($?).
# Arguments: $1 - label of the step, used in the message
# Outputs:   "<label> finished " on stdout when the step succeeded,
#            "<label> is error" on stderr when it failed
# Returns:   0 on success; on failure it does not return but exits the
#            script with the failed command's exit status
#######################################
function debug_echo () {
  # Capture $? before anything else runs; any other command would overwrite it.
  local rc=$?

  if [ "$rc" -eq 0 ]; then
    echo "$1 finished "
  else
    echo "$1 is error" >&2
    # A bare `exit` here would reuse echo's status (0) and make the whole
    # script look successful, so pass the real failure status on.
    exit "$rc"
  fi
}

# Locations used by the steps below: the Nutch install (which also provides
# the hadoop launcher), the Tomcat install, and the local working directory.
nutch_dir=/opt/nutchez/nutch
tomcat_dir=/opt/nutchez/tomcat
work_dir=/home/nutchuser/nutchez

# Load the Hadoop environment settings into this shell.
source "$nutch_dir/conf/hadoop-env.sh"
debug_echo "import hadoop-env.sh"

echo "delete search (local,hdfs) and urls (hdfs) "

# Remove the previous local copy of the search data.
rm -rf -- "${work_dir:?}/search"

# Remove the previous urls and search paths from HDFS. The result is
# deliberately not checked -- presumably these paths do not exist on a first
# run and the command then fails harmlessly (TODO confirm).
"$nutch_dir/bin/hadoop" dfs -rmr urls search

# Upload the local urls directory to HDFS. Check it: without the upload the
# crawl below has nothing valid to read.
"$nutch_dir/bin/hadoop" dfs -put "$work_dir/urls" urls
debug_echo "upload urls"

# Crawl the uploaded urls to the requested depth, writing into "search".
"$nutch_dir/bin/nutch" crawl urls -dir search -depth "$crawl_dep" -topN 5000 -threads 1000
debug_echo "nutch crawl"

# Copy the crawl output from HDFS back to the local working directory.
"$nutch_dir/bin/hadoop" dfs -get search "$work_dir/search"
debug_echo "download search"

# Restart Tomcat. The shutdown result is not checked (it presumably fails
# when Tomcat is not running); only the startup result decides the outcome.
"$tomcat_dir/bin/shutdown.sh"
"$tomcat_dir/bin/startup.sh"
debug_echo "tomcat restart"
Note: See TracBrowser for help on using the repository browser.