org.apache.nutch.crawl.Crawl实现的是一个完整的抓取过程,所以由它开始。
/**
 * Command-line entry point: performs complete crawling and indexing (to Solr) given a set of
 * root urls and the -solr parameter respectively. More information and usage parameters can
 * be found in {@code run}.
 *
 * @param args command-line arguments, handed unchanged to the {@code Crawl} tool
 * @throws Exception if the tool run fails
 */
public static void main(String[] args) throws Exception {
  final Configuration nutchConf = NutchConfiguration.create();
  // The tool's return value becomes the process exit status.
  System.exit(ToolRunner.run(nutchConf, new Crawl(), args));
}
org.apache.nutch.util.NutchConfiguration
/**
 * Registers the standard Nutch resource files on the given {@link Configuration}.
 *
 * @param conf Configuration object to which the resources are added
 * @return the same {@code conf} instance that was passed in
 */
private static Configuration addNutchResources(Configuration conf) {
  // Order is preserved: nutch-default.xml is added before nutch-site.xml.
  final String[] resourceNames = {"nutch-default.xml", "nutch-site.xml"};
  for (String resourceName : resourceNames) {
    conf.addResource(resourceName);
  }
  return conf;
}
初始化时,依次加载nutch-default.xml和nutch-site.xml。
@Override public int run(String[] args) throws Exception { if (args.length < 1) { System.out.println ("Usage: Crawl <urlDir> -solr <solrURL> [-dir d] [-threads n] [-depth i] [-topN N]"); return -1; } Path rootUrlDir = null; Path dir = new Path("crawl-" + getDate()); int threads = getConf().getInt("fetcher.threads.fetch", 10); int depth = 5; long topN = Long.MAX_VALUE; String solrUrl = null; //获得输入参数 for (int i = 0; i < args.length; i++) { if ("-dir".equals(args[i])) { dir = new Path(args[i+1]); i++; } else if ("-threads".equals(args[i])) { threads = Integer.parseInt(args[i+1]); i++; } else if ("-depth".equals(args[i])) { depth = Integer.parseInt(args[i+1]); i++; } else if ("-topN".equals(args[i])) { topN = Integer.parseInt(args[i+1]); i++; } else if ("-solr".equals(args[i])) { solrUrl = args[i + 1]; i++; } else if (args[i] != null) { rootUrlDir = new Path(args[i]); } } JobConf job = new NutchJob(getConf()); if (solrUrl == null) { LOG.warn("solrUrl is not set, indexing will be skipped..."); } FileSystem fs = FileSystem.get(job); if (LOG.isInfoEnabled()) { LOG.info("crawl started in: " + dir); LOG.info("rootUrlDir = " + rootUrlDir); LOG.info("threads = " + threads); LOG.info("depth = " + depth); LOG.info("solrUrl=" + solrUrl); if (topN != Long.MAX_VALUE) LOG.info("topN = " + topN); } //建立爬取过程中存放信息的文件夹,对应着各个阶段 Path crawlDb = new Path(dir + "/crawldb"); // Path linkDb = new Path(dir + "/linkdb"); Path segments = new Path(dir + "/segments"); Path indexes = new Path(dir + "/indexes"); Path index = new Path(dir + "/index"); //初始化配置信息 Path tmpDir = job.getLocalPath("crawl"+Path.SEPARATOR+getDate()); Injector injector = new Injector(getConf()); Generator generator = new Generator(getConf()); Fetcher fetcher = new Fetcher(getConf()); ParseSegment parseSegment = new ParseSegment(getConf()); CrawlDb crawlDbTool = new CrawlDb(getConf()); LinkDb linkDbTool = new LinkDb(getConf()); // initialize crawlDb 初始化crawlDb injector.inject(crawlDb, rootUrlDir); int i; for (i = 0; i < 
depth; i++) { // generate new segment 生成新的抓取队列 Path[] segs = generator.generate(crawlDb, segments, -1, topN, System .currentTimeMillis()); if (segs == null) { LOG.info("Stopping at depth=" + i + " - no more URLs to fetch."); break; } fetcher.fetch(segs[0], threads); // fetch it 抓取 if (!Fetcher.isParsing(job)) { parseSegment.parse(segs[0]); // parse it, if needed 解析 } crawlDbTool.update(crawlDb, segs, true, true); // update crawldb 更新crawlDb数据库 } if (i > 0) { linkDbTool.invert(linkDb, segments, true, true, false); // invert links 计算反向链接 if (solrUrl != null) { // index, dedup & merge 使用solr建立索引 FileStatus[] fstats = fs.listStatus(segments, HadoopFSUtil.getPassDirectoriesFilter(fs)); SolrIndexer indexer = new SolrIndexer(getConf()); indexer.indexSolr(solrUrl, crawlDb, linkDb, Arrays.asList(HadoopFSUtil.getPaths(fstats))); SolrDeleteDuplicates dedup = new SolrDeleteDuplicates(); dedup.setConf(getConf()); dedup.dedup(solrUrl); } } else { LOG.warn("No URLs to fetch - check your seed list and URL filters."); } if (LOG.isInfoEnabled()) { LOG.info("crawl finished: " + dir); } return 0; }
相关推荐
apache-nutch-2.3.1-src.tar.gz
lucene+nutch搜索引擎光盘源码(1-8章),一次上传不了那么多所以分卷了。
nutch配置nutch-default.xml
apache-nutch-1.3 的源码包,需要的可以看下
外网不能访问,故上传,一方面自己备份,一方面也免去大家无法下载的痛苦。只有nutch的源码,没有依赖包,如果需要依赖包,请自行下载
nutch-param-setnutch-param-setnutch-param-setnutch-param-set
apache-nutch-2.3.1-src.tar ,网络爬虫的源码, 用ivy2管理, ant runtime 编译 apache-nutch-2.3.1-src.tar ,网络爬虫的源码, 用ivy2管理, ant runtime 编译
Nutch是一款刚刚诞生的完整的开源搜索引擎系统,可以结合数据库进行索引,能快速构建所需系统。Nutch 是基于Lucene的,Lucene为 Nutch 提供了文本索引和搜索的API,所以它使用Lucene作为索引和检索的模块。Nutch的...
lucene+nutch搜索引擎(1-11章源码)
Nutch 是一个开源Java 实现的搜索引擎。这里是它的安装包。
apache-nutch-1.6-src.tar.gz 来自APACHE官网,本人亲自测试可以使用。
如果想把多次用nutch crawl获得的所有目录合并在一起。可以按以下步骤进行
nutch不用安装,是个应用程序,下载后为nutch-1.6.tar.gz,双击桌面上的cygwin快捷方式;执行以下命令: $ cd D:/Downloads/Soft $ tar zxvf nutch-1.0.tar.gz 在e盘下面出现nutch-0.9文件夹说明解压成功了.然后环境...
apache-nutch-1.4-bin.part1
apache-nutch-1.4-bin.tar.gz.part2
Nutch 1.2 学习笔记,讲的比较清楚的文档
Nutch 是一个开源Java 实现的搜索引擎。它提供了我们运行自己的搜索引擎所需的全部工具。包括全文搜索和Web爬虫。包含nutch-1.5.1的源码
apache-nutch-1.4-bin.tar.gz.part1
学习nutch 源码解读 轻松入门 搭建自己的nutch搜索引擎
一个开源Java 实现的搜索引擎nutch