java基础:12.4 web爬虫

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/L20902/article/details/86499469

跟随超链接来自动遍历Web.

package day11;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.LinkedHashSet;
import java.util.Scanner;
import java.util.Set;
/**
 * A toy web crawler: starting from a user-supplied URL, it follows
 * hyperlinks breadth-first and prints each distinct URL it visits,
 * stopping after 100 pages.
 */
public class WebCrawler {
	/**
	 * Reads a starting URL from standard input and crawls from it.
	 */
	public static void main(String[] args) {
		Scanner input = new Scanner(System.in);
		System.out.println("enter a URL(such as:http://www.xxxx.com):");
		String url = input.nextLine();
		crawler(url);
	}

	/**
	 * Traverses the web breadth-first from {@code startingURL}, printing
	 * each distinct URL as it is visited. Stops once 100 URLs have been
	 * traversed (the original {@code size() <= 100} visited 101 — off-by-one).
	 *
	 * @param startingURL the URL to begin crawling from
	 */
	public static void crawler(String startingURL) {
		// Deque-as-queue and Set replace the original ArrayLists:
		// remove(0) on an ArrayList and contains() are both O(n).
		Deque<String> pendingURLs = new ArrayDeque<>();
		Set<String> traversedURLs = new LinkedHashSet<>();
		pendingURLs.add(startingURL);
		int count = 0;
		while (!pendingURLs.isEmpty() && traversedURLs.size() < 100) {
			String urlString = pendingURLs.remove(); // dequeue the oldest URL (BFS)
			// Set.add returns false for a duplicate, so this both tests and records.
			if (traversedURLs.add(urlString)) {
				System.out.println("Crawl " + ++count + "  " + urlString);

				for (String s : getSubURLs(urlString)) {
					if (!traversedURLs.contains(s)) {
						pendingURLs.add(s);
					}
				}
			}
		}
	}

	/**
	 * Downloads the page at {@code urlString} and extracts every substring
	 * that runs from a literal {@code "http:"} up to (not including) the
	 * next double quote on the same line. Any failure (malformed URL,
	 * network error, ...) is reported to stdout and the URLs collected so
	 * far are returned, so callers never see an exception.
	 *
	 * @param urlString the page to scan
	 * @return the URLs found on the page; possibly empty, never null
	 */
	public static ArrayList<String> getSubURLs(String urlString) {
		ArrayList<String> list = new ArrayList<>();

		// try-with-resources closes the Scanner (and the underlying
		// stream); the original leaked it.
		try (Scanner input = new Scanner(new java.net.URL(urlString).openStream())) {
			while (input.hasNext()) {
				String line = input.nextLine();
				// Reset the search position for each line (the original
				// carried the previous line's offset over).
				int current = line.indexOf("http:");
				// >= 0: indexOf returns 0 for a URL at column 0; the
				// original's `> 0` silently skipped those URLs.
				while (current >= 0) {
					int endIndex = line.indexOf("\"", current);
					if (endIndex > 0) {
						list.add(line.substring(current, endIndex));
						current = line.indexOf("http:", endIndex);
					} else {
						// No closing quote on this line: stop scanning it.
						current = -1;
					}
				}
			}
		} catch (Exception ex) {
			System.out.println("error:" + ex.getMessage());
		}

		return list;
	}
}

猜你喜欢

转载自blog.csdn.net/L20902/article/details/86499469