| 26 | | Enable the plugin by updating the "nutch-site.xml" file found in the NUTCHHOME/conf directory |
| | 29 | 6. 進行 nutch 爬取 |
| | 30 | {{{ |
| | 31 | #!sh |
| | 32 | #!/bin/bash |
| | 33 | crawl_dep="$1" # crawl depth, taken from the first command-line argument |
| | 34 | echo $crawl_dep |
| | 35 | function debug_echo () { # report whether the command run just before this call succeeded; $1 labels the step |
| | 36 | if [ $? -eq 0 ]; then |
| | 37 | echo "$1 finished " |
| | 38 | else |
| | 39 | echo "$1 is error" |
| | 40 | exit 1 # a bare "exit" would return the status of the echo above (0) and hide the failure |
| | 41 | fi |
| | 42 | } |
| | 43 | source /opt/nutchez/nutch/conf/hadoop-env.sh |
| | 44 | debug_echo "import hadoop-env.sh" |
| | 45 | echo "delete search (local,hdfs) and urls (hdfs) " |
| | 46 | rm -rf /home/nutchuser/nutchez/search |
| | 47 | /opt/nutchez/nutch/bin/hadoop dfs -rmr urls search |
| | 48 | /opt/nutchez/nutch/bin/hadoop dfs -put /home/nutchuser/nutchez/urls urls |
| | 49 | # Run the nutch crawl on "urls", writing to "search", with the depth given on the command line |
| | 50 | /opt/nutchez/nutch/bin/nutch crawl urls -dir search -depth $crawl_dep -topN 5000 -threads 1000 |
| | 51 | debug_echo "nutch crawl" |
| | 52 | # Copy the crawl output "search" from HDFS into the local nutchez directory |
| | 53 | /opt/nutchez/nutch/bin/hadoop dfs -get search /home/nutchuser/nutchez/search |
| | 54 | debug_echo "download search" |
| | 55 | # Restart Tomcat; the debug_echo below only sees the exit status of startup.sh |
| | 56 | /opt/nutchez/tomcat/bin/shutdown.sh |
| | 57 | /opt/nutchez/tomcat/bin/startup.sh |
| | 58 | debug_echo "tomcat restart" |
| | 59 | }}} |
| | 63 | {{{ |
| | 64 | #!txt |
| | 65 | 2010-05-27 14:07:19,417 WARN org.apache.nutch.crawl.Injector: Skipping smb://140.110.138.179/share:java.net.MalformedURLException: unknown protocol: smb |
| | 66 | }}} |
| | 67 | |
| | 68 | * 試著用以下方法解決: |
| | 69 | {{{ |
| | 70 | #!txt |
| | 71 | a) A short-term solution is to install the JCIFS jar |
| | 72 | library found in the protocol-smb folder into |
| | 73 | JDKHOME/jre/lib/ext and (or) JREHOME/lib/ext |
| | 74 | |
| | 75 | b) After completing step a), if the exception is still thrown, |
| | 76 | set the system property by passing the following argument |
| | 77 | to the JVM: |
| | 78 | |
| | 79 | -Djava.protocol.handler.pkgs=jcifs |
| | 80 | |
| | 81 | c) You can also set the property in your code; for example, if |
| | 82 | you start crawling with org.apache.nutch.crawl.Crawl, |
| | 83 | add the following two lines. This has the same effect as b): |
| | 84 | public static void main(String args[]) throws Exception { |
| | 85 | System.setProperty("java.protocol.handler.pkgs", "jcifs"); |
| | 86 | new java.util.PropertyPermission("java.protocol.handler.pkgs","read, write") |
| | 87 | //and so on |
| | 88 | |
| | 89 | Also you can visit the FAQ page: http://jcifs.samba.org/src/docs/faq.html |
| | 90 | }}} |
| | 91 | |
| | 92 | 但是此warn 還是沒有解決,以至沒有入口點。於是到 http://jcifs.samba.org/src/docs/faq.html |
| | 93 | |
| | 94 | 將jcifs專案單獨測試, |
| | 95 | {{{ |
| | 96 | #!java |
| | 97 | import java.net.MalformedURLException; |
| | 98 | import java.text.SimpleDateFormat; |
| | 99 | import java.util.Date; |
| | 100 | import java.util.GregorianCalendar; |
| | 101 | |
| | 102 | import jcifs.smb.NtlmAuthenticator; |
| | 103 | import jcifs.smb.NtlmPasswordAuthentication; |
| | 104 | import jcifs.smb.SmbException; |
| | 105 | import jcifs.smb.SmbFile; |
| | 106 | |
| | 107 | public class test { |
| | 108 | |
| | 109 | /** Lists the entries of one SMB share directory via jCIFS and prints each entry name to stderr. |
| | 110 | * @param args command-line arguments (not read) |
| | 111 | * @throws MalformedURLException presumably from the SmbFile constructor if the smb:// URL is rejected (confirm in the jCIFS docs) |
| | 112 | * @throws SmbException presumably from listFiles() when the share cannot be listed (confirm in the jCIFS docs) |
| | 113 | */ |
| | 114 | public static void main(String[] args) throws MalformedURLException, SmbException { |
| | 115 | // Settings for the share under test; domain, username and password only feed the auth object, which the active SmbFile call does not use |
| | 116 | String domain = "WORKSTATION"; |
| | 117 | String username = "waue"; |
| | 118 | String password = "cccccc"; |
| | 119 | String server = "140.110.138.179"; |
| | 120 | String share = "share"; |
| | 121 | String directory = "."; |
| | 122 | SmbFile[] files = new SmbFile[0]; |
| | 123 | |
| | 124 | NtlmPasswordAuthentication auth = new NtlmPasswordAuthentication(domain, |
| | 125 | username, password); |
| | 126 | String smburl = String.format("smb://%s/%s/%s/", server, share, directory); |
| | 127 | // Authenticated variant, disabled for this run: SmbFile file = new SmbFile(smburl, auth); |
| | 128 | SmbFile file = new SmbFile(smburl); |
| | 129 | files = file.listFiles(); |
| | 130 | System.err.println("file : "); |
| | 131 | for (SmbFile fi : files){ |
| | 132 | System.err.println(fi.getName()); |
| | 133 | } |
| | 134 | } |
| | 135 | } |
| | 136 | }}} |
| | 137 | |
| | 138 | 得到結果 |
| | 139 | |
| | 140 | {{{ |
| | 141 | file : |
| | 142 | 【影片】/ |
| | 143 | 人月神話.pdf |
| | 144 | 其他/ |
| | 145 | 【音樂】/ |
| | 146 | test.txt |
| | 147 | 【軟體】/ |
| | 148 | 【照片】/ |
| | 149 | 【遊戲】/ |
| | 150 | }}} |
| | 151 | |
| | 152 | 證明此jcifs 在我的電腦可以 work,因此是 protocol-smb 與 nutch 之間的問題 |