1. Web crawler source code

Worker threads share a synchronized URL queue: dequeueURL hands out work, enqueueURL adds newly discovered links and wakes sleeping threads, and the crawl stops at a fixed link depth.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.logging.Logger;

public class Spider implements Runnable {

    private ArrayList urls;          // queue of URLs waiting to be crawled
    private HashMap indexedURLs;     // URLs that have already been crawled
    private ArrayList threadList;    // worker threads
    private int threads;             // initial thread count
    private static final Logger logger = Logger.getLogger(Spider.class.getName());

    public static void main(String[] argv) throws Exception {
        if (argv.length == 0 || argv[0] == null) {
            System.out.println("Missing required argument: Site URL");
            return;
        }
        Spider spider = new Spider(argv[0]);
        spider.go();
    }

    public Spider(String strURL) {
        urls = new ArrayList();
        threads = 10;
        urls.add(strURL);
        threadList = new ArrayList();
        indexedURLs = new HashMap();
        if (urls.size() == 0)
            throw new IllegalArgumentException("Missing required argument: -u start url");
    }

    public void go() throws Exception {
        long start = System.currentTimeMillis();
        // start the worker threads
        for (int i = 0; i < threads; i++) {
            Thread child = new Thread(this);
            child.start();
            threadList.add(child);
        }
        // wait for every worker to finish
        while (threadList.size() > 0) {
            Thread child = (Thread) threadList.remove(0);
            child.join();
        }
        long elapsed = System.currentTimeMillis() - start;
        logger.info("Finished in " + elapsed + " ms");
    }

    public void run() {
        String url;
        try {
            while ((url = dequeueURL()) != null) {
                indexURL(url);
            }
        } catch (Exception e) {
            logger.info(e.getMessage());
        }
    }

    // Check whether the URL queue still holds unparsed URLs; if so, return one
    // for the calling thread to process.
    public synchronized String dequeueURL() throws Exception {
        while (true) {
            if (urls.size() > 0) {
                return (String) urls.remove(0);
            } else {
                threads--;
                if (threads > 0) {
                    wait();       // sleep until another thread enqueues a URL
                    threads++;
                } else {
                    notifyAll();  // every thread is idle: the crawl is done
                    return null;
                }
            }
        }
    }

    /* Add a URL together with its depth, and wake up sleeping threads. */
    public synchronized void enqueueURL(String url, int level) {
        if (indexedURLs.get(url) == null) {
            urls.add(url);
            indexedURLs.put(url, new Integer(level));
            notifyAll();
        }
    }

    /**
     * Fetch the page behind a URL and extract the URLs it contains.
     * @param url the page link
     * @throws java.lang.Exception
     */
    private void indexURL(String url) throws Exception {
        // determine the depth of the link; the system default is three levels
        int level = 1;
        if (indexedURLs.get(url) == null) {
            indexedURLs.put(url, new Integer(level));
        } else {
            level = ((Integer) indexedURLs.get(url)).intValue();
        }
        // only crawl down to the second level
        if (level > 2) {
            return;
        }
        level++;
        String strBody = null;
        try {
            // fetch the page content
            strBody = loadURL(url);
        } catch (Exception e) {
            return;
        }
        if (strBody != null) {
            String[] urlGroups = null;
            try {
                // extract every URL on the page
                urlGroups = parseURLs(strBody);
            } catch (Exception e) {
                logger.info(e.getMessage());
            }
            if (urlGroups == null) {
                urlGroups = new String[0];
            }
            strBody = null;
            for (int i = 0; i < urlGroups.length; i++) {
                enqueueURL(urlGroups[i], level);
            }
        }
    }
}
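The listing calls loadURL and parseURLs, but the excerpt never shows them. Below is a minimal sketch of what two such helpers could look like inside the Spider class, assuming a plain java.net.URL fetch and a regular expression that pulls absolute href links out of the HTML; only the method names and call sites come from the listing, the bodies are an illustration.

// Additional imports these helpers would need at the top of the file:
// import java.io.BufferedReader;
// import java.io.InputStreamReader;
// import java.net.URL;
// import java.util.regex.Matcher;
// import java.util.regex.Pattern;

// Fetch the raw HTML of a page as a single string (assumed implementation).
private String loadURL(String url) throws Exception {
    StringBuilder body = new StringBuilder();
    BufferedReader in = new BufferedReader(
            new InputStreamReader(new URL(url).openStream()));
    String line;
    while ((line = in.readLine()) != null) {
        body.append(line).append('\n');
    }
    in.close();
    return body.toString();
}

// Pull every absolute href out of the HTML with a simple regex (assumed implementation).
private String[] parseURLs(String strBody) throws Exception {
    ArrayList found = new ArrayList();
    Pattern p = Pattern.compile("href\\s*=\\s*[\"'](http[^\"'#]+)[\"']",
            Pattern.CASE_INSENSITIVE);
    Matcher m = p.matcher(strBody);
    while (m.find()) {
        found.add(m.group(1));
    }
    return (String[]) found.toArray(new String[0]);
}

With helpers like these in place, the crawler can be tried with, for example, java Spider http://example.com, which starts ten worker threads from that seed URL and stops once the queue drains.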