<?xml version="1.0" encoding="UTF-8"?>
<!--
  Property overrides in name / value / description form.
  The plugin.* and db.* entries indicate this is an Apache Nutch
  configuration (presumably a site-level override file; confirm against
  the deployment).
-->
<configuration>

  <!-- Crawler identity advertised to remote servers. -->
  <!-- NOTE(review): "user", "localhost" and "you@yous" look like
       placeholders; confirm real values before crawling external hosts. -->
  <property>
    <name>http.agent.name</name>
    <value>user</value>
    <description>HTTP 'User-Agent' request header. </description>
  </property>
  <property>
    <name>http.agent.description</name>
    <value>MyTest</value>
    <description>Further description</description>
  </property>
  <property>
    <name>http.agent.url</name>
    <value>localhost</value>
    <description>A URL to advertise in the User-Agent header. </description>
  </property>
  <property>
    <name>http.agent.email</name>
    <value>you@yous</value>
    <description>An email address
    </description>
  </property>

  <!-- Plugin discovery and selection. -->
  <property>
    <name>plugin.folders</name>
    <value>/opt/nutchez/nutch/plugins</value>
    <description>Directories where nutch plugins are located. </description>
  </property>
  <property>
    <name>plugin.includes</name>
    <value>protocol-http|urlfilter-regex|parse-(text|html|ext|msexcel|mspowerpoint|msword|oo|pdf|rss|zip)|index-(more|basic|anchor)|query-(more|basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
    <description> Regular expression naming plugin directory names</description>
  </property>
  <property>
    <name>parse.plugin.file</name>
    <value>parse-plugins.xml</value>
    <description>The name of the file that defines the associations between
      content-types and parsers.</description>
  </property>

  <!-- Fetch and link limits. -->
  <!-- NOTE(review): the description below is blank. A value of -1
       presumably means "no limit on outlinks per page"; verify against
       the Nutch defaults file. -->
  <property>
    <name>db.max.outlinks.per.page</name>
    <value>-1</value>
    <description> </description>
  </property>
  <!-- NOTE(review): no description was supplied for this property. By
       analogy with file.content.limit further down, a negative value
       presumably disables truncation of HTTP content; TODO confirm. -->
  <property>
    <name>http.content.limit</name>
    <value>-1</value>
  </property>

  <!-- Lucene indexing tuning. -->
  <!-- NOTE(review): 500 is a large merge factor given the open-file
       warning in the description; confirm the OS file-handle limit. -->
  <property>
    <name>indexer.mergeFactor</name>
    <value>500</value>
    <description>The factor that determines the frequency of Lucene segment
      merges. This must not be less than 2, higher values increase indexing
      speed but lead to increased RAM usage, and increase the number of
      open file handles (which may lead to "Too many open files" errors).
      NOTE: the "segments" here have nothing to do with Nutch segments, they
      are a low-level data unit used by Lucene.
    </description>
  </property>

  <property>
    <name>indexer.minMergeDocs</name>
    <value>500</value>
    <description>This number determines the minimum number of Lucene
      Documents buffered in memory between Lucene segment merges. Larger
      values increase indexing speed and increase RAM usage.
    </description>
  </property>

  <!-- Crawl scope. -->
  <property>
    <name>db.ignore.external.links</name>
    <value>false</value>
    <description>If true, outlinks leading from a page to external hosts
      will be ignored. This is an effective way to limit the crawl to include
      only initially injected hosts, without creating complex URLFilters.
    </description>
  </property>

  <property>
    <name>file.content.limit</name>
    <value>1000000</value>
    <description>The length limit for downloaded content, in bytes.
      If this value is nonnegative (>=0), content longer than it will be truncated;
      otherwise, no truncation at all.
    </description>
  </property>

</configuration>