# prompt
# When no job name is supplied, print the usage text followed by the job
# names derived from the HDFS listing, then exit with status 9.
if [ -z "${1:-}" ]; then
  echo "Usage : fix <JOB_NAME>"
  echo " where JOB_NAME is one of: "
  echo "==========="
  # Keep listing lines containing "crawler", take whitespace field 8, and
  # print the 4th "/"-separated component of it.
  NN=$(/opt/crawlzilla/nutch/bin/hadoop dfs -ls | grep crawler | awk '{print $8}' | cut -d "/" -f 4)
  echo "$NN"
  echo "==========="
  exit 9
fi

# begin

JNAME=$1
LOGFILE=~/crawlzilla/debug_fix.log
META_PATH=/home/crawler/crawlzilla/.tmp

# Locations shared by the steps below, derived from the job name.
NUTCH_BIN=/opt/crawlzilla/nutch/bin
HDFS_JOB=/user/crawler/$JNAME
ARCHIVE_JOB=/home/crawler/crawlzilla/archieve/$JNAME
WEBAPPS=/opt/crawlzilla/tomcat/webapps

### not test
# NOTE(review): these three paths are not read by any step visible in this
# section; kept as in the original pending the go.sh changes noted below.
JPID="$META_PATH/$JNAME/${JNAME}_count_pid" # go.sh need add go.sh's pid
JDEPTH="$META_PATH/$JNAME/${JNAME}xxx" # go.sh need fix
JPTIME="$META_PATH/$JNAME/${JNAME}PassTime"
### not test

# Record which step failed in $LOGFILE (so that "see $LOGFILE" is actually
# useful), print the user-facing error line, and exit with status 8.
#   $1 - step label
#   $2 - exit status of the failed command
fail_step() {
  echo "ERROR: step '$1' failed (exit status $2)" >> "$LOGFILE"
  echo "ERROR!!! see $LOGFILE "
  exit 8
}

DATE=$(date)
echo "$JNAME BEGINE at $DATE" >> "$LOGFILE"

echo "1 invertlinks" >> "$LOGFILE"
"$NUTCH_BIN/nutch" invertlinks "$HDFS_JOB/linkdb" -dir "$HDFS_JOB/segments/" \
  || fail_step "1 invertlinks" $?

echo "2 index" >> "$LOGFILE"
SEGS=$("$NUTCH_BIN/hadoop" dfs -ls "$HDFS_JOB/segments" | grep segments | awk '{print $8 }')
# An empty list means the listing failed or the job has no segments; stop
# here instead of invoking the indexer without any segment argument.
[ -n "$SEGS" ] || fail_step "2 index (no segments listed)" 1
# $SEGS is intentionally unquoted: it holds one segment path per line and
# each path must reach nutch as a separate argument.
# shellcheck disable=SC2086
"$NUTCH_BIN/nutch" index "$HDFS_JOB/index" "$HDFS_JOB/crawldb" "$HDFS_JOB/linkdb" $SEGS \
  || fail_step "2 index" $?

echo "3 dedup" >> "$LOGFILE"
"$NUTCH_BIN/nutch" dedup "$HDFS_JOB/index" \
  || fail_step "3 dedup" $?

echo "4 download" >> "$LOGFILE"
"$NUTCH_BIN/hadoop" dfs -get "$JNAME" "$ARCHIVE_JOB" \
  || fail_step "4 download" $?

# Braces are required: "$JNAMEPassTime" would expand an unrelated, unset
# variable instead of the job name followed by "PassTime".
echo "5 ${JNAME}PassTime" >> "$LOGFILE"
echo "0h:0m:0s" >> "$ARCHIVE_JOB/${JNAME}PassTime" \
  || fail_step "5 ${JNAME}PassTime" $?

echo "6 append depth" >> "$LOGFILE"
echo "0" >> "$ARCHIVE_JOB/.crawl_depth" \
  || fail_step "6 append depth" $?

echo "7 mv index files from part-00000" >> "$LOGFILE"
# Only the trailing glob is left unquoted so the shell expands the file list.
mv "$ARCHIVE_JOB"/index/part-00000/* "$ARCHIVE_JOB/index/" \
  || fail_step "7 mv index files from part-00000" $?

echo "8 rmdir part-00000/" >> "$LOGFILE"
rmdir "$ARCHIVE_JOB/index/part-00000/" \
  || fail_step "8 rmdir part-00000/" $?

echo "9 tomcat" >> "$LOGFILE"
cp -rf "$WEBAPPS/default" "$WEBAPPS/$JNAME" \
  || fail_step "9 tomcat" $?

echo "10 nutch-site.xml" >> "$LOGFILE"
# Escape the characters that are special in a sed replacement (\ / &) so the
# job name is always inserted literally on line 8 of nutch-site.xml.
JNAME_SED=$(printf '%s' "$JNAME" | sed 's|[\/&]|\\&|g')
sed -i "8s/search/${JNAME_SED}/g" "$WEBAPPS/$JNAME/WEB-INF/classes/nutch-site.xml" \
  || fail_step "10 nutch-site.xml" $?

DATE=$(date)
# The timestamp is quoted and separated by a space; the original glued it
# directly onto "at" and let word splitting collapse its whitespace.
echo "$JNAME completed and finished at $DATE" >> "$LOGFILE"
| | 172 | }}} |