| | 1 | {{{ |
| | 2 | #!html |
| | 3 | <div style="text-align: center; color:#151B8D"><big style="font-weight: bold;"><big><big> |
| | 4 | 讓 Nutch 支援中文分詞 方法教學 |
| | 5 | </big></big></big></div> <div style="text-align: center; color:#7E2217"><big style="font-weight: bold;"><big> |
| | 6 | Nutch 1.0 + IK-Analyzer 3.1.6 中文分詞庫 |
| | 7 | </big></big></div> |
| | 8 | }}} |
| | 9 | [[PageOutline]] |
| | 10 | |
| | 11 | = 前言 = |
| | 12 | |
| | 13 | * Nutch 1.0 + IK-Analyzer 3.1.6 中文分詞庫 的詳細方法 |
| | 14 | * 本篇參考 [http://zha-zi.javaeye.com/blog/625619 nutch-1.0中文分词(原文連結)],並且修正與補充 |
| | 15 | * [http://ftp.twaren.net/Unix/Web/apache/nutch/ nutch 1.0 的下載位置] |
| | 16 | |
| | 17 | || nutch-1.0/ || 你下載nutch-1.0 並解壓縮的資料夾目錄位址 || 如:/home/user/nutch-1.0/ || |
| | 18 | |
| | 19 | = 1. 安裝 = |
| | 20 | * 安裝必要工具(java 已經安裝) |
| | 21 | {{{ |
| | 22 | sudo apt-get install javacc unrar ant |
| | 23 | }}} |
| | 24 | |
| | 25 | * 修改NutchAnalysis.jj 約130行左右的程式碼 (原本為:| <SIGRAM: <CJK> >) |
| | 26 | |
| | 27 | {{{ |
| | 28 | cd nutch-1.0/ |
| | 29 | vim src/java/org/apache/nutch/analysis/NutchAnalysis.jj |
| | 30 | }}} |
| | 31 | |
| | 32 | {{{ |
| | 33 | #!text |
| | 34 | | <SIGRAM: (<CJK>)+ > |
| | 35 | }}} |
| | 36 | |
| | 37 | * 用編譯器 javacc 編譯出七個java檔 |
| | 38 | {{{ |
| | 39 | #!text |
| | 40 | CharStream.java NutchAnalysisTokenManager.java TokenMgrError.java |
| | 41 | NutchAnalysisConstants.java ParseException.java |
| | 42 | NutchAnalysis.java Token.java |
| | 43 | }}} |
| | 44 | |
| | 45 | {{{ |
| | 46 | cd nutch-1.0/src/java/org/apache/nutch/analysis |
| | 47 | javacc -OUTPUT_DIRECTORY=./ika/ NutchAnalysis.jj |
| | 48 | mv ./ika/* ./ ; rm -rf ./ika/ |
| | 49 | }}} |
| | 50 | |
| | 51 | * 編輯剛產生出來的 NutchAnalysis.java |
| | 52 | {{{ |
| | 53 | vim nutch-1.0/src/java/org/apache/nutch/analysis/NutchAnalysis.java |
| | 54 | }}} |
| | 55 | |
| | 56 | * 在第48行加入ParseException: |
| | 57 | {{{ |
| | 58 | #!text |
| | 59 | public static Query parseQuery(String queryString, Configuration conf) throws IOException,ParseException |
| | 60 | }}} |
| | 61 | * 在第54行加入ParseException: |
| | 62 | {{{ |
| | 63 | #!text |
| | 64 | throws IOException,ParseException { |
| | 65 | }}} |
| | 66 | |
| | 67 | * 把 IKAnalyzer3.1.6GA.jar 放到 lib 資料夾 |
| | 68 | * nutch 1.0 用的是 lucene 2.4.0 hadoop 0.19 |
| | 69 | * 因此 ik 分詞器最新僅能用 3.1.6GA (3.2.0GA 以上版本只支援 lucene 2.9 以上版本) |
| | 70 | |
| | 71 | {{{ |
| | 72 | cd nutch-1.0/ |
| | 73 | wget http://ik-analyzer.googlecode.com/files/IKAnalyzer3.1.6GA_AllInOne.rar |
| | 74 | mkdir ika |
| | 75 | unrar x ./IKAnalyzer3.1.6GA_AllInOne.rar ika/ |
| | 76 | cp ika/IKAnalyzer3.1.6GA.jar lib/ |
| | 77 | rm -rf IKAnalyzer3.1.6GA_AllInOne.rar ika/ |
| | 78 | }}} |
| | 79 | |
| | 80 | * 修改 NutchDocumentAnalyzer.java 程式碼 |
| | 81 | {{{ |
| | 82 | cd nutch-1.0/ |
| | 83 | vim src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java |
| | 84 | }}} |
| | 85 | 將 |
| | 86 | {{{ |
| | 87 | #!text |
| | 88 | public TokenStream tokenStream(String fieldName, Reader reader) { |
| | 89 | Analyzer analyzer; |
| | 90 | if ("anchor".equals(fieldName)) |
| | 91 | analyzer = ANCHOR_ANALYZER; |
| | 92 | else |
| | 93 | analyzer = CONTENT_ANALYZER; |
| | 94 | |
| | 95 | return analyzer.tokenStream(fieldName, reader); |
| | 96 | } |
| | 97 | }}} |
| | 98 | 修改成 |
| | 99 | {{{ |
| | 100 | #!text |
| | 101 | public TokenStream tokenStream(String fieldName, Reader reader) { |
| | 102 | Analyzer analyzer = new org.wltea.analyzer.lucene.IKAnalyzer(); |
| | 103 | return analyzer.tokenStream(fieldName, reader); |
| | 104 | } |
| | 105 | }}} |
| | 106 | |
| | 107 | * 修改 build.xml,在 <include name="log4j-*.jar"/> 下(約195行),加入 |
| | 108 | {{{ |
| | 109 | cd nutch-1.0 |
| | 110 | vim build.xml |
| | 111 | }}} |
| | 112 | |
| | 113 | {{{ |
| | 114 | #!text |
| | 115 | <include name="IKAnalyzer3.1.6GA.jar"/> |
| | 116 | }}} |
| | 117 | |
| | 118 | * 重新編譯 nutch-1.0 |
| | 119 | {{{ |
| | 120 | ant |
| | 121 | }}} |
| | 122 | |
| | 123 | |
| | 124 | * 完成後會多一個資料夾 build |
| | 125 | * build/ 目錄裡面的 nutch-1.0.job 就是重編後的核心 |
| | 126 | |
| | 127 | * 接著將 build/classes 內的程式碼打包起來,建立nutch-1.0-ika.jar 函式庫 |
| | 128 | * 補充:我有把預設的 nutch-site.xml 以及 nutch-default.xml放進去一起打包 |
| | 129 | {{{ |
| | 130 | cd nutch-1.0/build/classes |
| | 131 | jar cvf nutch-1.0-ika.jar . |
| | 132 | cp nutch-1.0-ika.jar /opt/nutchez/nutch/lib/ |
| | 133 | }}} |
| | 134 | |
| | 135 | = 開始使用 = |
| | 136 | |
| | 137 | * 最後,將 build/nutch-1.0.job 複製到我的 nutchez 資料夾內取代原本的 nutch-1.0.job 使用 |
| | 138 | |
| | 139 | {{{ |
| | 140 | cd nutch-1.0 |
| | 141 | sudo mv /opt/nutchez/nutch/nutch-1.0.job /opt/nutchez/nutch/nutch-1.0-ori.job |
| | 142 | sudo cp build/nutch-1.0.job /opt/nutchez/nutch/nutch-1.0-ika-waue-100715.job |
| | 143 | cp build/nutch-1.0.job |
| | 144 | sudo ln -sf /opt/nutchez/nutch/nutch-1.0-ika-waue-100715.job /opt/nutchez/nutch/nutch-1.0.job |
| | 145 | }}} |
| | 146 | |
| | 147 | * 把 nutch-1.0.war 解開後的資料夾下 lib 中的 nutch-1.0.jar 更換成你新生成的 nutch-1.0-ika.jar 檔案,還要放入 ik 分詞器的 jar 檔案 |
| | 148 | * 最後爬取,搜索的結果就是按ik分過的中文詞 |
| | 149 | {{{ |
| | 150 | cd nutch-1.0/ |
| | 151 | cp lib/IKAnalyzer3.1.6GA.jar /opt/nutchez/nutch/lib/ |
| | 152 | |
| | 153 | cd /opt/nutchez/tomcat/webapps/ROOT/WEB-INF/lib |
| | 154 | cp nutch-1.0/build/nutch-1.0-ika.jar ./ |
| | 155 | cp nutch-1.0/lib/IKAnalyzer3.1.6GA.jar ./ |
| | 156 | }}} |
| | 157 | |
| | 158 | 完成 |