Changes between Initial Version and Version 1 of waue/2011/0725


Ignore:
Timestamp:
Jul 25, 2011, 6:00:26 PM (13 years ago)
Author:
waue
Comment:

--

Legend:

Unmodified
Added
Removed
Modified
  • waue/2011/0725

    v1 v1  
     1{{{
     2#!text
     3Merging 18 segments to /user/crawler/newcrawl_3/segments/20110725175250
     4SegmentMerger:   adding /user/crawler/crawl1/segments/20110722163308/content
     5SegmentMerger:   adding /user/crawler/crawl1/segments/20110722163308/crawl_fetch
     6SegmentMerger:   adding /user/crawler/crawl1/segments/20110722163308/crawl_generate
     7SegmentMerger:   adding /user/crawler/crawl1/segments/20110722163308/crawl_parse
     8SegmentMerger:   adding /user/crawler/crawl1/segments/20110722163308/parse_data
     9SegmentMerger:   adding /user/crawler/crawl1/segments/20110722163308/parse_text
     10SegmentMerger:   adding /user/crawler/crawl2/segments/20110531151117/content
     11SegmentMerger:   adding /user/crawler/crawl2/segments/20110531151117/crawl_fetch
     12SegmentMerger:   adding /user/crawler/crawl2/segments/20110531151117/crawl_generate
     13SegmentMerger:   adding /user/crawler/crawl2/segments/20110531151117/crawl_parse
     14SegmentMerger:   adding /user/crawler/crawl2/segments/20110531151117/parse_data
     15SegmentMerger:   adding /user/crawler/crawl2/segments/20110531151117/parse_text
     16SegmentMerger:   adding /user/crawler/crawl2/segments/20110531151312/content
     17SegmentMerger:   adding /user/crawler/crawl2/segments/20110531151312/crawl_fetch
     18SegmentMerger:   adding /user/crawler/crawl2/segments/20110531151312/crawl_generate
     19SegmentMerger:   adding /user/crawler/crawl2/segments/20110531151312/crawl_parse
     20SegmentMerger:   adding /user/crawler/crawl2/segments/20110531151312/parse_data
     21SegmentMerger:   adding /user/crawler/crawl2/segments/20110531151312/parse_text
     22SegmentMerger: using segment data from:
     23Exception in thread "main" java.io.IOException: No input paths specified in job
     24        at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:152)
     25        at org.apache.hadoop.mapred.SequenceFileInputFormat.listStatus(SequenceFileInputFormat.java:44)
     26        at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:201)
     27        at org.apache.hadoop.mapred.JobClient.writeOldSplits(JobClient.java:810)
     28        at org.apache.hadoop.mapred.JobClient.submitJobInternal(JobClient.java:781)
     29        at org.apache.hadoop.mapred.JobClient.submitJob(JobClient.java:730)
     30        at org.apache.hadoop.mapred.JobClient.runJob(JobClient.java:1249)
     31        at org.apache.nutch.segment.SegmentMerger.merge(SegmentMerger.java:638)
     32        at org.apache.nutch.segment.SegmentMerger.main(SegmentMerger.java:683)
     33Update segments
     34LinkDb: starting at 2011-07-25 17:52:55
     35LinkDb: linkdb: /user/crawler/newcrawl_3/linkdb
     36LinkDb: URL normalize: true
     37LinkDb: URL filter: true
     38LinkDb: java.io.IOException: No input paths specified in job
     39        at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:152)
     40        at org.apache.hadoop.mapred.SequenceFileInputFormat.listStatus(SequenceFileInputFormat.java:44)
     41        at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:201)
     42        at org.apache.hadoop.mapred.JobClient.writeOldSplits(JobClient.java:810)
     43        at org.apache.hadoop.mapred.JobClient.submitJobInternal(JobClient.java:781)
     44        at org.apache.hadoop.mapred.JobClient.submitJob(JobClient.java:730)
     45        at org.apache.hadoop.mapred.JobClient.runJob(JobClient.java:1249)
     46        at org.apache.nutch.crawl.LinkDb.invert(LinkDb.java:175)
     47        at org.apache.nutch.crawl.LinkDb.run(LinkDb.java:292)
     48        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:65)
     49        at org.apache.nutch.crawl.LinkDb.main(LinkDb.java:255)
     50
     51Index segments
     52ls: Cannot access /user/crawler/newcrawl_3/segments/*: No such file or directory.
     53[check] /opt/crawlzilla/nutch/bin/nutch index /user/crawler/newcrawl_3/newindexes /user/crawler/newcrawl_3/crawldb /user/crawler/newcrawl_3/linkdb
     54Usage: Indexer <index> <crawldb> <linkdb> <segment> ...
     55De-duplicate indexes
     56Dedup: starting at 2011-07-25 17:53:02
     57Dedup: adding indexes in: /user/crawler/newcrawl_3/newindexes
     58DeleteDuplicates: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://crawlweb1:9000/user/crawler/newcrawl_3/newindexes
     59        at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:190)
     60        at org.apache.nutch.indexer.DeleteDuplicates$InputFormat.getSplits(DeleteDuplicates.java:149)
     61        at org.apache.hadoop.mapred.JobClient.writeOldSplits(JobClient.java:810)
     62        at org.apache.hadoop.mapred.JobClient.submitJobInternal(JobClient.java:781)
     63        at org.apache.hadoop.mapred.JobClient.submitJob(JobClient.java:730)
     64        at org.apache.hadoop.mapred.JobClient.runJob(JobClient.java:1249)
     65        at org.apache.nutch.indexer.DeleteDuplicates.dedup(DeleteDuplicates.java:451)
     66        at org.apache.nutch.indexer.DeleteDuplicates.run(DeleteDuplicates.java:519)
     67        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:65)
     68        at org.apache.nutch.indexer.DeleteDuplicates.main(DeleteDuplicates.java:503)
     69
     70Merge indexes
     71IndexMerger: starting at 2011-07-25 17:53:07
     72IndexMerger: merging indexes to: /user/crawler/newcrawl_3/index
     73IndexMerger: finished at 2011-07-25 17:53:07, elapsed: 00:00:00
     74Some stats
     75CrawlDb statistics start: /user/crawler/newcrawl_3/crawldb
     76Statistics for CrawlDb: /user/crawler/newcrawl_3/crawldb
     77TOTAL urls:     514
     78retry 0:        514
     79min score:      0.0
     80avg score:      0.010715953
     81max score:      1.076
     82status 1 (db_unfetched):        454
     83status 2 (db_fetched):  52
     84status 3 (db_gone):     2
     85status 5 (db_redir_perm):       6
     86CrawlDb statistics: done
     87finish on : /home/crawler/newcrawl_3
     88
     89}}}