= Compilation =

Download and extract nutch-1.2 (currently using nutch-1.2-bin.tar.gz), then open the grammar file:

{{{
cd $nutch-1.2/
vim src/java/org/apache/nutch/analysis/NutchAnalysis.jj
}}}

Change the SIGRAM token rule so that a run of CJK characters is matched as a single token:

{{{
#!text
| <SIGRAM: (<CJK>)+ >
}}}
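
For orientation, a hedged before/after sketch of this edit: the stock grammar is assumed to define SIGRAM as a single CJK character, and the change extends it to a run of CJK characters so the whole run reaches the analyzer as one token (the surrounding rules may differ between Nutch releases).

{{{
#!text
// assumed original rule in NutchAnalysis.jj
| <SIGRAM: <CJK> >

// changed to
| <SIGRAM: (<CJK>)+ >
}}}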

 * Use the javacc compiler to generate the seven Java source files:

{{{
CharStream.java
NutchAnalysis.java
NutchAnalysisConstants.java
NutchAnalysisTokenManager.java
ParseException.java
Token.java
TokenMgrError.java
}}}

{{{
cd $nutch-1.2/src/java/org/apache/nutch/analysis
javacc -OUTPUT_DIRECTORY=./ika/ NutchAnalysis.jj
mv ./ika/* ./ ; rmdir ika;
}}}
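
If javacc is not already on the PATH, it can usually be installed from the system package manager (the package name below is an assumption for Debian/Ubuntu-style systems), and the seven generated sources can then be listed to confirm the run succeeded:

{{{
sudo apt-get install javacc    # assumed package name; adjust for your distribution
ls CharStream.java NutchAnalysis*.java ParseException.java Token.java TokenMgrError.java
}}}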

 * Edit the freshly generated !NutchAnalysis.java:

{{{
vim $nutch-1.2/src/java/org/apache/nutch/analysis/NutchAnalysis.java
}}}

 * Add ParseException to the throws clause (two places in total):

{{{
#!text
public static Query parseQuery(....) throws IOException, ParseException
}}}
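
For reference, a hedged sketch of what the parseQuery declarations in the generated !NutchAnalysis.java look like after this edit; the parameter lists are assumptions based on the stock Nutch 1.x source and may differ slightly in your tree:

{{{
#!text
public static Query parseQuery(String queryString, Configuration conf)
    throws IOException, ParseException { ... }

public static Query parseQuery(String queryString, Analyzer analyzer, Configuration conf)
    throws IOException, ParseException { ... }
}}}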

In /opt/nutch-1.2/src/java/org/apache/nutch/searcher/Query.java (around line 456), wrap the parseQuery call in a try/catch for the new ParseException:

{{{
#!java
public static Query parse(String queryString, String queryLang, Configuration conf)
    throws IOException {
  Query que;
  try {
    que = fixup(NutchAnalysis.parseQuery(
        queryString, AnalyzerFactory.get(conf).get(queryLang), conf), conf);
  } catch (org.apache.nutch.analysis.ParseException e) {
    que = new Query();
  }
  return que;
}
}}}


 * Download the IKAnalyzer 3.2.8 package (2011/07/29) and extract it:
[http://code.google.com/p/ik-analyzer/downloads/list]

nutch-1.2 ships with lucene-core-3.0.1.jar, so the matching IK Analyzer release is 3.2.8:

|| IK Analyzer version || Lucene compatibility || Solr support ||
|| 3.1.6GA || compatible with Lucene 2.9.1 and earlier || provides adapters for solr1.3 and solr1.4 ||
|| 3.2.0G and later || compatible with Lucene 2.9 and 3.0; does not support Lucene 2.4 and earlier || provides an adapter for solr1.4 only ||

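To double-check which Lucene version your Nutch tree actually bundles before picking the IK Analyzer release, the jar in lib/ can be listed (paths follow the placeholders used elsewhere on this page):

{{{
ls $nutch-1.2/lib/lucene-core-*.jar
}}}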

Extract IKAnalyzer3.2.8.jar from the IKAnalyzer3.2.8 bin.zip archive and copy it into each of the following directories:

{{{
cp IKAnalyzer3.2.8.jar $nutch-1.2/lib/
cp IKAnalyzer3.2.8.jar $my_nutch_dir/lib/
cp IKAnalyzer3.2.8.jar $my_tomcat_dir/webapps/ROOT/WEB-INF/lib
}}}

 * Modify !NutchDocumentAnalyzer.java so that content is tokenized with IK Analyzer:

{{{
vim src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java
}}}

Replace

{{{
#!java
public TokenStream tokenStream(String fieldName, Reader reader) {
  Analyzer analyzer;
  if ("anchor".equals(fieldName))
    analyzer = ANCHOR_ANALYZER;
  else
    analyzer = CONTENT_ANALYZER;

  return analyzer.tokenStream(fieldName, reader);
}
}}}

with

{{{
#!java
public TokenStream tokenStream(String fieldName, Reader reader) {
  Analyzer analyzer;
  if ("anchor".equals(fieldName))
    analyzer = ANCHOR_ANALYZER;
  else
    //analyzer = CONTENT_ANALYZER;
    analyzer = new org.wltea.analyzer.lucene.IKAnalyzer();
  return analyzer.tokenStream(fieldName, reader);
}
}}}


 * Edit build.xml so that the IK Analyzer jar is bundled into the build, adding the following include (a hedged sketch of the surrounding context follows):

{{{
#!text
<include name="IKAnalyzer*.jar"/>
}}}
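
The exact spot depends on the nutch-1.2 build.xml layout; a hedged sketch, assuming the include is added next to the existing lucene include inside the war target's <lib> fileset (verify against your own build.xml):

{{{
#!text
<!-- assumed location: <lib> fileset of the war target in build.xml -->
<lib dir="${lib.dir}">
  <include name="lucene*.jar"/>
  <include name="IKAnalyzer*.jar"/>
  <!-- other existing includes unchanged -->
</lib>
}}}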



= Deployment =

 * Recompile nutch and produce nutch-1.2.job:

{{{
ant
}}}

 * The nutch-1.2.job inside the build/ directory is the rebuilt core; also rebuild the jar and the web application:
{{{
ant jar; ant war;
}}}


 * Copy the rebuilt nutch-1.2.jar into my nutchez directory, replacing the one in use (a hedged copy sketch is shown below).

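A minimal sketch of that replacement step, assuming the ant build writes its artifacts to build/ under the standard nutch-1.2 names and that $my_nutch_dir points at the deployed nutchez installation (adjust both to your layout):

{{{
# assumption: artifact names and target paths below match your build and deployment
cp build/nutch-1.2.jar $my_nutch_dir/
cp build/nutch-1.2.job $my_nutch_dir/
}}}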

 * Finally, crawl pages with nutch crawl; search results will then show Chinese terms segmented by IK.


== Optional: adding a dictionary file ==

 1. Edit IKAnalyzer.cfg.xml:

{{{
#!text
<properties>
  <comment>IK Analyzer</comment>
  <entry key="ext_dict">/cyc.dic</entry>
</properties>
}}}

 2. Edit your dictionary file cyc.dic, one keyword per line, for example:

{{{
數學
嘉義縣網
}}}

 3. Open /opt/crawlzilla/nutch/nutch-1.2.job with an archive tool and add cyc.dic and IKAnalyzer.cfg.xml into it (a command-line sketch is shown below).
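
A hedged sketch of step 3 using the JDK jar tool instead of a GUI archiver (a .job file is an ordinary zip/jar archive); run it from the directory that holds cyc.dic and IKAnalyzer.cfg.xml so both land at the archive root, which is where the /cyc.dic path in the config expects them:

{{{
# update the deployed job archive in place with the dictionary and config
jar uf /opt/crawlzilla/nutch/nutch-1.2.job IKAnalyzer.cfg.xml cyc.dic
}}}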

 4. Restart all crawlzilla services.


Indexes built from subsequent crawls will then use this Chinese word segmentation.

Note:
If there are two or more dictionary files, they can all be placed inside the nutch-1.2.job archive together; edit IKAnalyzer.cfg.xml to list every dictionary file, separating them with semicolons, for example:

{{{
#!text
<entry key="ext_dict">/cyc.dic;/cyc2.dic</entry>
}}}
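
Putting that together with the config from step 1, a full IKAnalyzer.cfg.xml listing two dictionaries would look like this (cyc2.dic is the hypothetical second dictionary from the entry above):

{{{
#!text
<properties>
  <comment>IK Analyzer</comment>
  <entry key="ext_dict">/cyc.dic;/cyc2.dic</entry>
</properties>
}}}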