#!/bin/bash
# Author: WeiYu Chen
# License: GPL
# Description: Easy-to-use setup helpers for Nutch (NutchEz).
#
# This file only defines variables and functions; nothing is executed at
# load time.  Per-user state lives under ~/.nutchez and the answers gathered
# from the dialog prompts are staged in /tmp/n.*.tmp until they are promoted
# to ~/.nutchez/sav/n.*.txt.

#DIALOG='dialog --backtitle " NutchEz Setup Menu -- powered by NCHC "'
# Deliberately expanded unquoted below so it may carry extra options.
DIALOG=dialog
# Set to 1 to show extra debug message boxes.
VERB=0

# Create ~/.nutchez from the templates in /etc/nutch on first use.
# Does nothing when ~/.nutchez already exists.
init_nutchez() {
  if ! [ -e ~/.nutchez ]; then
    # copy from /etc/nutch
    mkdir ~/.nutchez
    cp -rf /etc/nutch/conf ~/.nutchez/
    mkdir ~/.nutchez/log
    cp -rf /etc/nutch/sav ~/.nutchez/
    chown -R "$LOGNAME:$LOGNAME" ~/.nutchez
  fi
}

# Show a debug message box, but only when VERB is 1.
# Arguments: $1 - message text (dialog renders "\n" as a line break)
echo_dialog_v() {
  if [[ "$VERB" -eq 1 ]]; then
    $DIALOG --msgbox "$1" 16 51
  fi
}

# Make sure a file exists so later reads of it do not fail.
# A missing file is created containing a single empty line.
# Arguments: $1 - path to check
test_file() {
  if ! test -e "$1"; then
    echo_dialog_v "test_file: \n can not find $1"
    echo "" > "$1"
  else
    echo_dialog_v "test_file: \n Touch $1 ! \n Its content is \n $(cat "$1")"
  fi
}

# Exit the script with status 1 unless it runs as root (UID 0).
# The explanation is only displayed in verbose mode.
check_if_root() {
  if [[ "$UID" -ne 0 ]]; then
    echo_dialog_v "Hi [$LOGNAME] !! "
    echo_dialog_v "You need to run this script \"$(basename "$0")\" as root."
    exit 1
  fi
}

# Turn the staged answers in /tmp into the saved defaults under
# ~/.nutchez/sav, then remove whatever staging files are left.
promote_tempfile() {
  echo_dialog_v "7. chang tmp as txt"
  rm -f ~/.nutchez/sav/n.*.txt
  mv /tmp/n.urls.tmp ~/.nutchez/sav/n.urls.txt
  mv /tmp/n.robot.tmp ~/.nutchez/sav/n.robot.txt
  mv /tmp/n.crawler.tmp ~/.nutchez/sav/n.crawler.txt
  mv /tmp/n.tomcat.tmp ~/.nutchez/sav/n.tomcat.txt
  rm -f /tmp/n.*.tmp
}

# Discard the staged answers without saving them.
clean_tempfile() {
  echo_dialog_v "7. delete tmp"
  rm -f /tmp/n.*.tmp
}

# Prepare ~/.nutchez for a crawl: refresh the URL list from the saved
# answers and, when NOCONTINUE is 1, move the previous search result aside.
# Globals: NOCONTINUE (read), DATE (written)
# Returns: 1, always - see the NOTE(review) below.
setup_nutchez() {
  if ! [ -e ~/.nutchez/urls ]; then
    # make url list dir
    mkdir ~/.nutchez/urls
  fi
  if [ -e ~/.nutchez/urls/urls.txt ]; then
    rm ~/.nutchez/urls/urls.txt
  fi
  cp ~/.nutchez/sav/n.urls.txt ~/.nutchez/urls/urls.txt

  # Only act when the user explicitly chose to clear the previous result.
  if [[ -n "${NOCONTINUE:-}" && "$NOCONTINUE" -eq 1 ]]; then
    echo_dialog_v " delete the ~/.nutchez/search/*"
    DATE=$(date +%Y%m%d%H%M%S)
    mv ~/.nutchez/search ~/.nutchez/search-"$DATE"
    $DIALOG --msgbox "上次搜尋的結果改放到 ~/.nutchez/search-$DATE " 0 0
    # rm -f /tmp/search
    # rm -rf ~/.nutchez/search/*
  fi

  # NOTE(review): the remainder of this function is corrupted in this copy of
  # the script - text between the sed expressions is missing, so the commands
  # cannot be executed as written.  They are kept verbatim here and must be
  # restored from the original source:
  #
  #   if [ -e ~/.nutchez/conf/nutch-site.xml ] ; then
  #     # set nutch-site.xml
  #     sed -i -e "4s/[a-zA-Z0-9]*$ROBOT[a-zA-Z0-9/]*<|$HOMEDIR<|" ~/.nutchez/tomcat/webapps/ROOT/WEB-INF/classes/nutch-site.xml
  #   fi
  #   # change explorer port
  #   sed -i -e "67s//tmp/n.urls.tmp
  #
  # Fail loudly instead of silently skipping the configuration rewrite.
  echo "setup_nutchez: nutch-site.xml and tomcat port rewrite steps are missing from this script; configuration was NOT updated" >&2
  return 1
}

# NOTE(review): at least one definition is missing here.  Only the tail of a
# function survives; it presumably prompted for the URL list and stored the
# answer in /tmp/n.urls.tmp (TODO confirm against the original source).
# Surviving tail, verbatim:
#
#     ... /tmp/n.urls.tmp
#     RET=$?
#     echo_dialog_v "2.1 cat url: `cat /tmp/n.urls.tmp`"
#     return $RET
#   }

# Ask for the crawler's robot name; the answer goes to /tmp/n.robot.tmp.
# The previously saved name is offered as the default.
setup_robot() {
  test_file ~/.nutchez/sav/n.robot.txt
  echo_dialog_v "3. setup_robot"
  # dialog writes the entered text to stderr, hence the 2> redirection.
  $DIALOG --title "設定機器人名稱" --nocancel --inputbox " 這個爬網機器人,你要將他取名為:\n\n (ps: 這個設定只是禮貌性宣告,並不會對結果造成影響) \n" 16 55 "$(cat ~/.nutchez/sav/n.robot.txt)" 2>/tmp/n.robot.tmp
  echo_dialog_v "3.1 cat robot : $(cat /tmp/n.robot.tmp)"
}

# Ask for the crawl depth; the answer goes to /tmp/n.crawler.tmp.
# The previously saved depth is offered as the default.
setup_crawler() {
  echo_dialog_v "4. setup_crawler"
  test_file ~/.nutchez/sav/n.crawler.txt
  $DIALOG --title "設定抓取深度" --nocancel --inputbox " 對於每個網址,你需要NutchEz爬多深呢?\n\n (ps: 初次體驗建議將深度設為1來感受需要多久) \n " 16 51 "$(cat ~/.nutchez/sav/n.crawler.txt)" 2>/tmp/n.crawler.tmp
  # Report the crawler answer (this used to print the robot file by mistake).
  echo_dialog_v "4.1 cat crawler : $(cat /tmp/n.crawler.tmp)"
}

# Ask for the web server port; the answer goes to /tmp/n.tomcat.tmp.
# The previously saved port is offered as the default.
setup_tomcat() {
  echo_dialog_v "5. setup_tomcat"
  test_file ~/.nutchez/sav/n.tomcat.txt
  $DIALOG --title "設定網頁伺服器" --nocancel --inputbox " 你希望NutchEz將網頁伺服器開在哪個port \n\n (ps: 請選擇一個沒用到的port以免造成衝突 \n 也請盡量不要設成80以免造成你誤以為是apache的混淆) \n " 16 51 "$(cat ~/.nutchez/sav/n.tomcat.txt)" 2>/tmp/n.tomcat.tmp
  echo_dialog_v "5.1 cat tomcat : $(cat /tmp/n.tomcat.tmp)"
}

# Ask whether the previous crawl result should be cleared.
# Globals: NOCONTINUE (written) - 1 for "yes", 0 for "no"; left untouched
#          when the dialog is dismissed with ESC.
continue_previous() {
  echo_dialog_v "6. continue_previous"
  $DIALOG --title "清除上次搜尋" --clear \
    --yesno "你是否要清除上一次爬網所得的結果,\n否則將加入到URL列裡增加搜尋負擔 \n\n ps: 選no的話,會跑相當相當久,\n 請慎重考慮之\n" 16 51
  case $? in
    0) NOCONTINUE=1 ;;
    1) NOCONTINUE=0 ;;
    255) echo "ESC pressed." ;;
  esac
  echo_dialog_v " 6continue = $NOCONTINUE"
}

# Show a summary of every staged answer and let the user confirm it.
# Globals: NOCONTINUE (read); tempfile, MSG, RET (written)
# Returns: the exit status of the confirmation dialog, which offers the
#          buttons "ok", "reset" (extra button) and "exit" (cancel).
final_confirm() {
  echo_dialog_v "7. final_confirm : start =0 , back =1 "
  tempfile=/tmp/n.finalcheck.tmp
  # The "\n" sequences are written literally; dialog turns them into breaks.
  echo " \n 1. 你所選擇要爬取的網址為 : \n " > "$tempfile"
  cat /tmp/n.urls.tmp >> "$tempfile"
  echo " \n\n 2. 對於這個爬網機器人,你取名為 : \n" >> "$tempfile"
  cat /tmp/n.robot.tmp >> "$tempfile"
  echo " \n\n 3. 爬網的深度,你設定為 : \n " >> "$tempfile"
  cat /tmp/n.crawler.tmp >> "$tempfile"
  echo " \n\n 4. NutchEz將會把你的搜尋結果呈現在這個Port : \n " >> "$tempfile"
  cat /tmp/n.tomcat.tmp >> "$tempfile"
  if [[ $NOCONTINUE -eq 0 ]]; then
    echo " \n\n 5. 是否要清除上一次的收尋結果 : \n " >> "$tempfile"
    echo_dialog_v " 7continue = $NOCONTINUE"
    echo "NO" >> "$tempfile"
  elif [[ $NOCONTINUE -eq 1 ]]; then
    echo " \n\n 5. 是否要清除上一次的收尋結果繼續搜尋 : \n " >> "$tempfile"
    echo_dialog_v " 7continue = $NOCONTINUE"
    echo "YES" >> "$tempfile"
  else
    echo_dialog_v " 無資料可匯入 "
  fi
  MSG=$(cat "$tempfile")
  echo_dialog_v "7.1 final message :\n $MSG"
  #read READ
  $DIALOG --title "請檢查你的選擇 ! \n\n 若所有的設定都是正確的,你可以按 \"ok\",\n 若你按了 \"reset\" 則會重頭開始設定, \n 若你選擇 \"exit\" 則會跳出NutchEz的設定選單 \n ps: reset 與 exit都不會把資料記成預設值,請放心使用 " --clear \
    --extra-button --extra-label "reset" --ok-label "ok" --cancel-label "exit" \
    --yesno "$MSG" 26 51
  RET=$?
  echo_dialog_v "final return = $RET"
  return $RET
}

# define parameters
# Load the saved answers into the globals ROBOT, URLS, DEPTH and PORT.
set_nutchez_p() {
  ROBOT=$(< ~/.nutchez/sav/n.robot.txt)
  URLS=$(< ~/.nutchez/sav/n.urls.txt)
  DEPTH=$(< ~/.nutchez/sav/n.crawler.txt)
  PORT=$(< ~/.nutchez/sav/n.tomcat.txt)
}

# Prepare the workspace and run the Nutch crawl over ~/.nutchez/urls.
# Globals: DEPTH, NUTCH_CONF_DIR (read)
# NOTE(review): install_tomcat is not defined in the visible part of this
# script; it is expected to be provided elsewhere.
start_crawl() {
  echo_dialog_v "7. start_crawl"
  setup_nutchez
  install_tomcat
  echo_dialog_v "/opt/nutchez/nutch/bin/nutch crawl ~/.nutchez/urls -dir ~/.nutchez/search -depth $DEPTH"
  echo_dialog_v "nutch conf dir = $NUTCH_CONF_DIR"
  /opt/nutchez/nutch/bin/nutch crawl ~/.nutchez/urls -dir ~/.nutchez/search -depth "$DEPTH" -topN 5000 -threads 1000
}

# Kill any Tomcat instance that is already running, then start the one
# installed under ~/.nutchez/tomcat and give it three seconds to come up.
# Globals: pid_tc (written)
start_tomcat() {
  echo_dialog_v "8. start_tomcat "
  echo_dialog_v "/opt/nutchez/nutch/tomcat/bin/startup.sh"
  #if [ -e /tmp/search ];then
  #  rm -rf /tmp/search
  #fi
  #ln -sf ~/.nutchez/search/ /tmp/
  pid_tc=$(ps axw -eo pid,command \
    | grep "catalina" | grep "java" \
    | grep "start" | awk '{print $1}')
  if [ -z "$pid_tc" ]; then
    echo_dialog_v "no another tomcat is running"
  else
    echo_dialog_v "tomcat had been started and the pid is $pid_tc"
    echo_dialog_v "stop it first"
    # Test the exit status of kill itself; "$?" is never an empty string, so
    # the former `[ -z $? ]` check reported an error even on success.
    # pid_tc may list several PIDs, so it is split into words on purpose.
    # shellcheck disable=SC2086
    if kill -9 $pid_tc; then
      echo_dialog_v " tomcat ($pid_tc) is killed ..."
    else
      echo_dialog_v "kill error ..."
    fi
  fi
  echo "Starting Tomcat ...."
  ~/.nutchez/tomcat/bin/startup.sh
  sleep 3
}

# Open the search page in Firefox; when Firefox is not installed or exits
# with an error, show the URL in a message box instead.
# Globals: PORT (read); FIREFOX, RET (written)
show_report() {
  echo_dialog_v "9. show_report "
  FIREFOX=$(command -v firefox)
  RET=$?
  if [[ $RET -eq 0 ]]; then
    "$FIREFOX" -D 0.0 "http://localhost:$PORT"
    RET=$?
  fi
  if [[ $RET -ne 0 ]]; then
    $DIALOG --msgbox "恭喜你已經完成了! \n 你可以用瀏覽器瀏覽: \n http://host_ip:$PORT" 0 0
  fi
}